Skip to content

Commit

Permalink
Merge scripts for writing final report
Browse files Browse the repository at this point in the history
  • Loading branch information
stanikae committed Jul 15, 2020
1 parent 02384f5 commit 3366053
Show file tree
Hide file tree
Showing 4 changed files with 169 additions and 217 deletions.
14 changes: 7 additions & 7 deletions bin/07.poppunk.sh
Original file line number Diff line number Diff line change
Expand Up @@ -97,40 +97,40 @@ if [[ "$MLSTscheme" == "spyogenes" ]]; then
head -n1 $poppunk_dir/gas_db/gas_db_clusters.csv > $poppunk_dir/assigned_gpscs.csv
grep -F -f $poppunk_dir/grep_list.txt $poppunk_dir/gas_db/gas_db_clusters.csv | sed 's|/.*/||g' >> $poppunk_dir/assigned_gpscs.csv
Rscript $SCRIPTS_DIR/csv2xlsx.R \
$poppunk_dir/assigned_gpscs.csv $reports_dir/assigned_gpscs.xlsx >> $project/tmp/poppunk_converting_csv.log
$poppunk_dir/assigned_gpscs.csv $reports_dir/07.GAS.assigned-gpscs.xlsx >> $project/tmp/07.GAS.gpscs.poppunk.csv2xlsx.log
# determine if the novel (NA) files are the same or not using the clusters.csv file
head -n1 $poppunk_dir/gas_db/gas_db_clusters.csv > $poppunk_dir/assigned_clusters.csv
grep -F -f $poppunk_dir/grep_list.txt $poppunk_dir/gas_db/gas_db_clusters.csv | sed 's|/.*/||g' >> $poppunk_dir/assigned_clusters.csv
Rscript $SCRIPTS_DIR/csv2xlsx.R \
$poppunk_dir/assigned_clusters.csv $reports_dir/assigned_clusters.xlsx >> $project/tmp/poppunk_converting_csv.log
$poppunk_dir/assigned_clusters.csv $reports_dir/07.GAS.assigned-clusters.xlsx >> $project/tmp/07.GAS.clusters.poppunk.csv2xlsx.log
elif [[ "$MLSTscheme" == "spneumoniae" ]]; then
echo -e "\t[`date +"%d-%b-%Y %T"`]\tCreating PopPunk GPSC output file for $MLSTscheme"
head -n1 $poppunk_dir/spn_db/spn_db_external_clusters.csv > $poppunk_dir/assigned_gpscs.csv
grep -F -f $poppunk_dir/grep_list.txt $poppunk_dir/spn_db/spn_db_external_clusters.csv | sed 's|/.*/||g' >> $poppunk_dir/assigned_gpscs.csv
Rscript $SCRIPTS_DIR/csv2xlsx.R \
$poppunk_dir/assigned_gpscs.csv $reports_dir/assigned_gpscs.xlsx >> $project/tmp/poppunk_converting_csv.log
$poppunk_dir/assigned_gpscs.csv $reports_dir/07.SPN.assigned-gpscs.xlsx >> $project/tmp/07.SPN.gpsc.poppunk.csv2xlsx.log
# determine if the novel (NA) files are the same or not using the clusters.csv file
head -n1 $poppunk_dir/spn_db/spn_db_clusters.csv > $poppunk_dir/assigned_clusters.csv
grep -F -f $poppunk_dir/grep_list.txt $poppunk_dir/spn_db/spn_db_clusters.csv | sed 's|/.*/||g' >> $poppunk_dir/assigned_clusters.csv
Rscript $SCRIPTS_DIR/csv2xlsx.R \
$poppunk_dir/assigned_clusters.csv $reports_dir/assigned_clusters.xlsx >> $project/tmp/poppunk_converting_csv.log
$poppunk_dir/assigned_clusters.csv $reports_dir/07.SPN.assigned-clusters.xlsx >> $project/tmp/07.SPN.clusters.poppunk.csv2xlsx.log
else
head -n1 $poppunk_dir/strain_db/strain_db_external_clusters.csv > $poppunk_dir/assigned_gpscs.csv
grep -F -f $poppunk_dir/grep_list.txt $poppunk_dir/strain_db/strain_db_external_clusters.csv | sed 's|/.*/||g' >> $poppunk_dir/assigned_gpscs.csv
Rscript $SCRIPTS_DIR/csv2xlsx.R \
$poppunk_dir/assigned_gpscs.csv $reports_dir/assigned_gpscs.xlsx >> $project/tmp/poppunk_converting_csv.log
$poppunk_dir/assigned_gpscs.csv $reports_dir/07.other.assigned-gpscs.xlsx >> $project/tmp/07.other.gpsc.poppunk.csv2xlsx.log
# determine if the novel (NA) files are the same or not using the clusters.csv file
head -n1 $poppunk_dir/strain_db/strain_db_clusters.csv > $poppunk_dir/assigned_clusters.csv
grep -F -f $poppunk_dir/grep_list.txt $poppunk_dir/strain_db/strain_db_clusters.csv | sed 's|/.*/||g' >> $poppunk_dir/assigned_clusters.csv
Rscript $SCRIPTS_DIR/csv2xlsx.R \
$poppunk_dir/assigned_clusters.csv $reports_dir/assigned_clusters.xlsx >> $project/tmp/poppunk_converting_csv.log
$poppunk_dir/assigned_clusters.csv $reports_dir/07.other.assigned-clusters.xlsx >> $project/tmp/07.other.poppunk.csv2xlsx.log
fi
# Save results in the Reports directory
echo -e "\t[`date +"%d-%b-%Y %T"`]\tCopy final PopPunk results to the Reports directory"
cp $poppunk_dir/*/*.{csv,nwk,dot} $poppunk_report/

else
echo -e "\t[`date +"%d-%b-%Y %T"`]\tNumber of samples too low to run PopPunk for ${projectName} ...... provide at least 3 samples"
echo -e "\t[`date +"%d-%b-%Y %T"`]\tNumber of samples too low to run PopPunk for ${projectName} ...... provide at least 4 samples"
fi
#Rscript bin/adding_poppunk_results.R \
#~/kedibone/35B-Isolates/Reports_35B-Isolates_11_Sep_2019 35B-Isolates_WGS-typing-report.xlsx \
Expand Down
153 changes: 153 additions & 0 deletions bin/09.merge_results.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,153 @@
#!/usr/bin/env Rscript
#args = commandArgs(trailingOnly=TRUE)


#.libPaths("~/repos/jekesa/lib/Rlib")

library(plyr)
library(tidyverse)
library(tidyr)
library(dplyr)
library(purrr)
library(stringr)
library(openxlsx)
library(readxl)

######################
# get input using arguments
# Usage: 09.merge_results.R <MLST scheme> <reports directory> <output file name>
#   args[1]  MLST scheme name; selects which organism-specific reports are merged
#   args[2]  directory that holds the per-step .xlsx reports
#   args[3]  name of the merged workbook, written inside args[2]
args <- commandArgs(TRUE)

# Fail early with a clear message: without this check a missing argument turns
# into the string "NA" in the paths built below (obscure read error, or a
# merged report written to a file literally called "NA").
if (length(args) < 3 || any(!nzchar(args[1:3]))) {
  stop(
    "Usage: 09.merge_results.R <MLSTscheme> <reports_dir> <output_file.xlsx>",
    call. = FALSE
  )
}

#getwd()
dir <- file.path(args[2])
#print(dir)

# Read one report workbook from the reports directory (header row expected).
read_report <- function(file_name) {
  read_excel(file.path(dir, file_name), col_names = TRUE)
}

countReads <- read_report("03.countReads.xlsx")
covDepth <- read_report("03.coverageDepth.xlsx")
bactIns <- read_report("04.bactInspector.xlsx")
conFin <- read_report("04.confindr.xlsx")
kraken <- read_report("04.kraken.xlsx")
metrics <- read_report("05.quast.xlsx")
mlst <- read_report("05.mlst.xlsx")
resFin <- read_report("06.resfinder.xlsx")
pointFin <- read_report("06.pointfinder.xlsx")
aribaAMR <- read_report("06.aribaAMR-known_variants.xlsx")
aribaVF <- read_report("06.aribaVFs-known_variants.xlsx")
aribaAMRn <- read_report("06.aribaAMR-novel_variants.xlsx")
aribaVFn <- read_report("06.aribaVFs-novel_variants.xlsx")
############# filter quast results ####################################
# Keep only the QUAST summary rows that are carried into the merged report.
quast_rows <- c(
  "# contigs (>= 200 bp)", "Largest contig", "Total length", "GC (%)", "N50"
)
metrics1 <- metrics %>% dplyr::filter(Assembly %in% quast_rows)
# remove additional strings to remain with only sample ID
colnames(metrics1) <- str_remove(colnames(metrics1), "_scaffolds|_assembly")
# transpose data using dplyr and tidyr: one row per sample column, one column
# per QUAST metric named in `Assembly`
metrics2 <- metrics1 %>%
  gather(assembly, metrics, -Assembly) %>%
  spread(key = Assembly, value = metrics)
# reorder columns
col_order <- c(
  "assembly", "# contigs (>= 200 bp)", "GC (%)", "N50", "Largest contig",
  "Total length"
)
# all_of() marks col_order as an external character vector; passing it bare is
# ambiguous (a column called `col_order` would win) and deprecated in tidyselect.
metrics2 <- metrics2 %>% dplyr::select(all_of(col_order))

################# filter MLST results ##################################
# Strip the assembly file suffix so FILE holds the bare sample ID
# (dots escaped so they match a literal "." only).
mlst$FILE <- str_remove(mlst$FILE, "_scaffolds\\.fasta|_assembly\\.fasta")
################# filter and join ARIBA results #################################
# Drop the literal ".match" suffix from the ARIBA column names. fixed() is
# required: as a regex, ".match" would also strip any character followed by
# "match" (e.g. turn "mismatch" into "mi").
colnames(aribaAMR) <- str_remove(colnames(aribaAMR), fixed(".match"))
colnames(aribaVF) <- str_remove(colnames(aribaVF), fixed(".match"))
colnames(aribaAMRn) <- str_remove(colnames(aribaAMRn), fixed(".match"))
colnames(aribaVFn) <- str_remove(colnames(aribaVFn), fixed(".match"))

# Add a constant label column to an ARIBA table and move `name` plus that
# label to the front, so the label marks where each section starts once the
# tables are joined side by side.
label_ariba <- function(df, label_col, label) {
  df[[label_col]] <- rep(label, nrow(df))
  dplyr::select(df, name, all_of(label_col), everything())
}

# join ariba known variants
aribaAMR <- label_ariba(aribaAMR, "aribaAMR", "AMRvariants")
aribaVF <- label_ariba(aribaVF, "aribaVFs", "VFvariants")
ariba_df <- dplyr::full_join(aribaAMR, aribaVF, by = "name")
# join ariba novel variants
aribaAMRn <- label_ariba(aribaAMRn, "aribaAMRnovel", "AMR-novel-variants")
aribaVFn <- label_ariba(aribaVFn, "aribaVFsnovel", "VF-novel-variants")
aribaNovel_df <- dplyr::full_join(aribaAMRn, aribaVFn, by = "name")
################# rename colnames ######################################
# The first column of each table is renamed to the shared join key "SampleID".
colnames(countReads)[1] <- "SampleID"
colnames(covDepth)[1] <- "SampleID"
colnames(bactIns)[1] <- "SampleID"
colnames(conFin)[1] <- "SampleID"
colnames(resFin)[1] <- "SampleID"
colnames(pointFin)[1] <- "SampleID"
colnames(ariba_df)[1] <- "SampleID"
colnames(aribaNovel_df)[1] <- "SampleID"
# MLST: first column is the sample, second the scheme
colnames(mlst)[1:2] <- c("SampleID", "Scheme.MLST")
# kraken and the transposed QUAST table get a complete new header
colnames(kraken) <- c(
  "SampleID",
  "kraken2_match_#1",
  "kraken2_match_#2",
  "kraken2_match_#3",
  "kraken2_match_#4",
  "kraken2_X"
)
colnames(metrics2) <- c(
  "SampleID",
  "Contig.num",
  "Contigs.GC.content",
  "N50.value",
  "Longest.contig",
  "Total.bases.assembly"
)

################# remove unwanted columns ###############################
covDepth <- covDepth %>% dplyr::select(-Est.GenomeSize)
kraken <- kraken %>% dplyr::select(-kraken2_X)

################# join data by group/section ############################
# Every section is a full join on SampleID, so a sample missing from one
# table is still kept.
# metrics
metric_tables <- list(countReads, covDepth, metrics2, mlst)
metric_df <- plyr::join_all(metric_tables, by = "SampleID", type = "full")
# contamination check
contam_tables <- list(bactIns, conFin, kraken)
contam_df <- plyr::join_all(contam_tables, by = "SampleID", type = "full")
# CGE AMR and mutations
cge_df <- resFin %>% dplyr::full_join(pointFin, by = "SampleID")


################ Join and write results to .xlsx file ###################
# Picks the organism-specific typing reports according to the MLST scheme
# (args[1]), full-joins them with the shared sections on SampleID and writes
# the merged table to <dir>/<args[3]>.

# Read one typing report from the reports directory and name its first
# column "SampleID" so it can be joined with the other tables.
read_typing_report <- function(file_name) {
  report <- read_excel(file.path(dir, file_name), col_names = TRUE)
  names(report)[1] <- "SampleID"
  report
}

# Read a PopPUNK assignment report. bin/07.poppunk.sh only strips the
# directory part of each assembly path, so the sample column may still carry
# the assembly file suffix; strip it from the values (the join key) as well as
# from the column names.
read_poppunk_report <- function(file_name) {
  suffix <- "_scaffolds.fasta|_assembly.fasta"
  report <- read_excel(file.path(dir, file_name), col_names = TRUE)
  colnames(report) <- str_remove(colnames(report), suffix)
  if (is.character(report[[1]])) {
    report[[1]] <- str_remove(report[[1]], suffix)
  }
  names(report)[1] <- "SampleID"
  report
}

if (args[1] == "spneumoniae") {
  # serotyping
  seroba <- read_typing_report("07.seroba.xlsx")
  # NOTE(review): pili is read but was never part of the merged table;
  # confirm whether it should be added to the join below.
  pili <- read_typing_report("07.SPN-pili.xlsx")
  pbp <- read_typing_report("07.SPN-pbp-typing.xlsx")
  # poppunk gpsc and clusters results
  gpsc <- read_poppunk_report("07.SPN.assigned-gpscs.xlsx")
  clusters <- read_poppunk_report("07.SPN.assigned-clusters.xlsx")
  # fixed: the list referenced the undefined object `cluster`
  merge_tables <- list(
    gpsc, clusters, contam_df, metric_df, seroba, cge_df, pbp, ariba_df
  )
} else if (args[1] == "spyogenes") {
  # fixed: the file name had a second file name pasted onto it
  # ("07.GAS-typing.xlsx07.GAS.assigned-gpscs.xlsx")
  pbp <- read_typing_report("07.GAS-typing.xlsx")
  # poppunk gpsc and clusters results
  gpsc <- read_poppunk_report("07.GAS.assigned-gpscs.xlsx")
  clusters <- read_poppunk_report("07.GAS.assigned-clusters.xlsx")
  # fixed: the list referenced the undefined object `cluster`
  merge_tables <- list(
    gpsc, clusters, contam_df, metric_df, cge_df, pbp, ariba_df
  )
} else if (args[1] == "senterica") {
  sistrDF <- read_typing_report("07.sistr.xlsx")
  seqseroDF <- read_typing_report("07.seqsero.xlsx")
  merge_tables <- list(
    contam_df, metric_df, seqseroDF, sistrDF, cge_df, ariba_df
  )
} else {
  # Any other scheme: only the sections shared by all organisms
  merge_tables <- list(contam_df, metric_df, cge_df, ariba_df)
}

# Merging the data sets
cmd_df <- plyr::join_all(merge_tables, by = "SampleID", type = "full")
# write results to xlsx file
openxlsx::write.xlsx(cmd_df, file.path(dir, args[3]), overwrite = TRUE)


153 changes: 0 additions & 153 deletions bin/merging_reports.R

This file was deleted.

Loading

0 comments on commit 3366053

Please sign in to comment.