In [19]:
setwd("/Users/rebecca/sudmant/analyses/myotis/analysis/reciprocal_best_hits/01_RBHs")

library(pafr)
library(ggplot2)
library(dplyr, quietly = TRUE)
library(data.table)
library(GenomicRanges)

myo_meta <- read.csv("/Users/rebecca/sudmant/analyses/myotis/data/myotis_meta.csv")

paf_dir <- "/Users/rebecca/sudmant/analyses/myotis/data/pafs"

In [10]:
for (i in seq_along(myo_meta$Abbr)) {
  
  for(j in seq_along(myo_meta$Abbr)) {
    
    if (i != j) {

      paf1 <- read_paf(list.files(
        path = paf_dir, 
        pattern = paste0(myo_meta$Abbr[i], ".*", myo_meta$Abbr[j]),
        full.names = TRUE
      ), include_tags = FALSE)
      
      paf2 <- read_paf(list.files(
        path = paf_dir, 
        pattern = paste0(myo_meta$Abbr[j], ".*", myo_meta$Abbr[i]),
        full.names = TRUE
      ), include_tags = FALSE)
      
      chromsize <- fread(paste0("/Users/rebecca/sudmant/analyses/myotis/data/", 
                                myo_meta$Abbr[i], "_chromsizes"), 
                         data.table = FALSE)
      colnames(chromsize) <- c("Chr", "Chr_Length")

      rbhs <- fread(
        paste0("results/data/", myo_meta$Abbr[i], "_vs_", myo_meta$Abbr[j], "_RBHs.csv"), 
        data.table = FALSE
      )
      
      ## Get fraction of genome with hits in PAF1

      paf1 <- paf1 %>%
        arrange(desc(nmatch)) %>%
        dplyr::mutate(
          tname = gsub("SCAF", "SUPER", gsub("#", ".", tname, fixed = TRUE)),
        ) %>%
        dplyr::mutate(
          seqnames = tname,
          start = tstart,
          end = tend
        )
      paf1_ranges <- GenomicRanges::makeGRangesFromDataFrame(paf1, ignore.strand = T)
      paf1_ranges <- paf1_ranges[unique(findOverlaps(paf1_ranges, type = "any", select = "first")),]
      paf1_df <- as.data.frame(paf1_ranges)

      paf1_lengths <- paf1_df %>%
        dplyr::mutate(
          Chr = sapply(strsplit(as.character(seqnames), ".", fixed = T), 
                       function(x) x[length(x)]),
          Length = end - start
        ) %>%
        dplyr::group_by(Chr) %>%
        dplyr::summarise(Recip_Length = sum(Length))
      
      paf1_lengths <- merge(chromsize, paf1_lengths, by = 1, all = TRUE)
      paf1_lengths$Recip_Length[is.na(paf1_lengths$Recip_Length)] <- 0
      paf1_lengths$Frac = with(paf1_lengths, Recip_Length / Chr_Length)
      paf1_lengths$Frac_PAF <- paf1_lengths$Frac

      ## Get fraction of genome with hits in PAF2

      paf2 <- paf2 %>%
        arrange(desc(nmatch)) %>%
        dplyr::mutate(
          qname = gsub("SCAF", "SUPER", gsub("#", ".", qname, fixed = TRUE)),
        ) %>%
        dplyr::mutate(
          seqnames = qname,
          start = qstart,
          end = qend
        )
      paf2_ranges <- GenomicRanges::makeGRangesFromDataFrame(paf2, ignore.strand = T)
      paf2_ranges <- paf2_ranges[unique(findOverlaps(paf2_ranges, type = "any", select = "first")),]
      paf2_df <- as.data.frame(paf2_ranges)

      paf2_lengths <- paf2_df %>%
        dplyr::mutate(
          Chr = sapply(strsplit(as.character(seqnames), ".", fixed = T), 
                      function(x) x[length(x)]),
          Length = end - start
        ) %>%
        dplyr::group_by(Chr) %>%
        dplyr::summarise(Recip_Length = sum(Length))
      paf2_lengths <- merge(chromsize, paf2_lengths, by = 1, all = TRUE)
      paf2_lengths$Recip_Length[is.na(paf2_lengths$Recip_Length)] <- 0
      paf2_lengths$Frac = with(paf2_lengths, Recip_Length / Chr_Length)
      paf2_lengths$Frac_PAF <- paf2_lengths$Frac

      ## Get fraction of genome with hits after restricting to RBHs

      rbhs1 <- rbhs %>%
        dplyr::select(Spec_Chr, Spec_Start, Spec_End)

      rbh1_lengths <- rbhs %>%
        dplyr::mutate(
          Chr = Spec_Chr,
          Length = Spec_End - Spec_Start
        ) %>%
        dplyr::group_by(Chr) %>%
        dplyr::summarise(Recip_Length = sum(Length))
      rbh1_lengths <- merge(chromsize, rbh1_lengths, by = 1, all = TRUE)
      rbh1_lengths$Recip_Length[is.na(rbh1_lengths$Recip_Length)] <- 0
      rbh1_lengths$Frac = with(rbh1_lengths, Recip_Length / Chr_Length)
      rbh1_lengths$Frac_RBHs <- rbh1_lengths$Frac

      ## Plot comparisons
      
      compare_hits1 <- merge(paf1_lengths, rbh1_lengths, by = "Chr")
      compare_hits2 <- merge(paf2_lengths, rbh1_lengths, by = "Chr")
      compare_pafs <- merge(paf1_lengths, paf2_lengths, by = "Chr")

      pdf(file = paste0("results/figures/", myo_meta$Abbr[i], "_vs_", myo_meta$Abbr[j], 
                        "_RBHs_concordance.pdf"),
          width = 8.5, height = 8)
      
      print(
        ggplot(compare_hits1, aes(x = Frac_PAF, y = Frac_RBHs)) +
          geom_point(size = 1) +
          geom_text(aes(label = Chr)) +
          theme_minimal() +
          theme(axis.title.x = element_text(margin = margin(t = 10)),
                axis.title.y = element_text(margin = margin(r = 10)),
                plot.margin = unit(c(1, 1, 1, 1), "cm")) +
          labs(title = paste("Fraction of", myo_meta$Field_Name[i], 
                             "genome with hits in", myo_meta$Field_Name[j], "genome")) +
          xlab("PAF 1") + ylab("RBHs")
      )
        
      print(
        ggplot(compare_hits2, aes(x = Frac_PAF, y = Frac_RBHs)) +
          geom_point(size = 1) +
          geom_text(aes(label = Chr)) +
          theme_minimal() +
          theme(axis.title.x = element_text(margin = margin(t = 10)),
                axis.title.y = element_text(margin = margin(r = 10)),
                plot.margin = unit(c(1, 1, 1, 1), "cm")) +
          labs(title = paste("Fraction of", myo_meta$Field_Name[i], 
                             "genome with hits in", myo_meta$Field_Name[j], "genome")) +
          xlab("PAF 2") + ylab("RBHs")
      )
        
      print(
        ggplot(compare_pafs, aes(x = Frac_PAF.x, y = Frac_PAF.y)) +
          geom_point(size = 1) +
          geom_text(aes(label = Chr)) +
          theme_minimal() +
          theme(axis.title.x = element_text(margin = margin(t = 10)),
                axis.title.y = element_text(margin = margin(r = 10)),
                plot.margin = unit(c(1, 1, 1, 1), "cm")) +
          labs(title = paste("Fraction of", myo_meta$Field_Name[i], 
                             "genome with hits in", myo_meta$Field_Name[j], "genome")) +
          xlab("PAF 1") + ylab("PAF 2")
      ) 
    
      dev.off()
      
    }
    
  }
}

In [21]:
## Get fraction of genome in each species that has a RBH

pdf(file = paste0("results/figures/RBHs_genome_frac.pdf"), width = 8.5, height = 8)

lapply(seq_along(myo_meta$Abbr), function(i) {
  
  genome_frac_list <- lapply(seq_along(myo_meta$Abbr), function(j) {
    
    if (i != j) {
      
      chromsize <- fread(paste0("/Users/rebecca/sudmant/analyses/myotis/data/", 
                         myo_meta$Abbr[i], "_chromsizes"), 
                    data.table = FALSE)
      colnames(chromsize) <- c("Chr", "Chr_Length")

      rbhs <- fread(
        paste0("results/data/", myo_meta$Abbr[i], "_vs_", myo_meta$Abbr[j], "_RBHs.csv"), 
        data.table = FALSE
      )

      ## Get fraction of genome with hits after restricting to RBHs

      rbhs1 <- rbhs %>%
        dplyr::select(Spec_Chr, Spec_Start, Spec_End)

      rbh1_lengths <- rbhs %>%
        dplyr::mutate(
          Chr = Spec_Chr,
          Length = Spec_End - Spec_Start
        ) %>%
        dplyr::group_by(Chr) %>%
        dplyr::summarise(Recip_Length = sum(Length))
      rbh1_lengths <- merge(chromsize, rbh1_lengths, by = 1, all = TRUE)
      rbh1_lengths$Recip_Length[is.na(rbh1_lengths$Recip_Length)] <- 0

      return(with(rbh1_lengths, sum(Recip_Length) / sum(Chr_Length)))
    
    }
    
  })

  df <- data.frame(Query = myo_meta$Field_Name[-i], 
                   Frac = unlist(genome_frac_list), 
                   row.names = NULL)
  
  df$Query <- factor(df$Query, levels = df$Query[rev(order(df$Frac))])
  
  print(
    ggplot(df, aes(x = Query, y = Frac)) +
      geom_bar(stat = "identity") +
      theme_minimal() +
      theme(axis.title.x = element_blank(),
            axis.text.x = element_text(angle = 45, hjust = 1),
            axis.title.y = element_text(margin = margin(r = 10)),
            plot.margin = unit(c(1, 1, 1, 1), "cm")) +
      labs(title = paste("Fraction of", myo_meta$Field_Name[i], "genome with RBHs")) +
      xlab("Query genome") + ylab("Fraction") +
      scale_y_continuous(limits = c(0, .6))
  )

})

dev.off()  

[[1]]

[[2]]

[[3]]

[[4]]

[[5]]

[[6]]

[[7]]

[[8]]

[[9]]
