In [8]:
# Session setup: working directory, package attaches, species metadata.

# All relative paths used later in this file ("resources/pafs/trimmed/",
# "results/") are resolved against this directory.
setwd("/Users/rebecca/sudmant/analyses/myotis/analysis/reciprocal_best_hits/01_get_best_hits")

# NOTE: attach order decides which package wins when function names clash
# (later attaches mask earlier ones), so keep this order unless the masking
# has been checked.
library(pafr)           # read_paf() is used below to load PAF alignments
library(ggplot2)        # ggplot(), geom_point(), geom_text(), theme_minimal()
library(ggrepel)        # NOTE(review): no ggrepel function is called in the visible code
library(dplyr, quietly = TRUE)  # %>%, arrange(), mutate(), group_by(), summarise()
library(data.table)     # fread()
library(GenomicRanges)  # makeGRangesFromDataFrame(), findOverlaps()

# One row per species; the code below reads the `Abbr` column (used to build
# file names) and the `Field_Name` column (used in plot titles).
myo_meta <- read.csv("/Users/rebecca/sudmant/analyses/myotis/data/myotis_meta.csv")

In [None]:
## For every ordered pair of species (i, j) in `myo_meta`, compare the
## per-chromosome fraction of species i covered by PAF alignments with the
## fraction covered by the best hits stored in
## results/<i>_vs_<j>_best_hits.csv, and write three comparison plots to one
## PDF per pair under results/.
## NOTE(review): assumes the target sequences of the i.*j PAF and the query
## sequences of the j.*i PAF both belong to species i, since both are merged
## with species i's chromosome sizes -- TODO confirm.

# Locate the one file in `paf_dir` whose name matches the regular expression
# "<first_abbr>.*<second_abbr>". Stops when no file or more than one file
# matches, because exactly one path is handed on to read_paf().
find_paf_file <- function(first_abbr, second_abbr,
                          paf_dir = "resources/pafs/trimmed/") {
  matches <- list.files(
    path = paf_dir,
    pattern = paste0(first_abbr, ".*", second_abbr),
    full.names = TRUE
  )
  if (length(matches) != 1L) {
    stop(
      "Expected exactly one PAF in '", paf_dir, "' matching '",
      first_abbr, ".*", second_abbr, "', found ", length(matches), ".",
      call. = FALSE
    )
  }
  matches
}

# Outer-merge per-chromosome hit lengths (`Chr`, `Recip_Length`) with the
# chromosome sizes (`Chr`, `Chr_Length`) and add the covered fraction, stored
# both as `Frac` and under the column name given in `frac_col`.
add_hit_fraction <- function(hit_lengths, chromsize, frac_col) {
  merged <- merge(chromsize, hit_lengths, by = 1, all = TRUE)
  # Chromosomes without any hit come out of the outer merge as NA.
  merged$Recip_Length[is.na(merged$Recip_Length)] <- 0
  merged$Frac <- with(merged, Recip_Length / Chr_Length)
  merged[[frac_col]] <- merged$Frac
  merged
}

# Per-chromosome covered fraction for alignment intervals. `hits` must carry
# `seqnames`, `start` and `end` columns and be sorted best hit first.
paf_hit_fraction <- function(hits, chromsize) {
  hit_ranges <- GenomicRanges::makeGRangesFromDataFrame(
    hits,
    ignore.strand = TRUE
  )
  # For each interval, take the lowest-index interval that overlaps it (the
  # interval itself counts), then keep every interval selected this way once.
  # NOTE(review): the kept intervals can still overlap one another, so the
  # summed lengths may count some bases twice -- confirm whether
  # GenomicRanges::reduce() was intended.
  first_hit <- findOverlaps(hit_ranges, type = "any", select = "first")
  best_hits <- as.data.frame(hit_ranges[unique(first_hit), ])

  hit_lengths <- best_hits %>%
    dplyr::mutate(
      # Chromosome name = text after the last "." in the sequence name.
      Chr = vapply(
        strsplit(as.character(seqnames), ".", fixed = TRUE),
        function(parts) parts[length(parts)],
        character(1)
      ),
      # Deliberately `end - start` rather than the GRanges `width`.
      Length = end - start
    ) %>%
    dplyr::group_by(Chr) %>%
    dplyr::summarise(Recip_Length = sum(Length))

  add_hit_fraction(hit_lengths, chromsize, "Frac_PAF")
}

# Labelled scatter plot of two per-chromosome fractions; `mapping` is the
# aes() giving the x and y columns of `df`.
plot_concordance <- function(df, mapping, title, x_label, y_label) {
  ggplot(df, mapping) +
    geom_point(size = 1) +
    geom_text(aes(label = Chr)) +
    theme_minimal() +
    theme(axis.title.x = element_text(margin = margin(t = 10)),
          axis.title.y = element_text(margin = margin(r = 10)),
          plot.margin = unit(c(1, 1, 1, 1), "cm")) +
    labs(title = title) +
    xlab(x_label) + ylab(y_label)
}

# Print each plot in `plots` to its own page of an 8 x 8 inch PDF at `path`.
# The device is closed even when a plot fails, so no device is left open.
write_plots_pdf <- function(path, plots) {
  pdf(file = path, width = 8, height = 8)
  pdf_dev <- dev.cur()
  on.exit(dev.off(pdf_dev), add = TRUE)
  for (p in plots) {
    print(p)
  }
  invisible(path)
}

for (i in seq_along(myo_meta$Abbr)) {
  for (j in seq_along(myo_meta$Abbr)) {
    # A species is never compared with itself.
    if (i == j) {
      next
    }

    abbr_i <- myo_meta$Abbr[i]
    abbr_j <- myo_meta$Abbr[j]

    ## Load inputs for this pair

    paf1 <- read_paf(find_paf_file(abbr_i, abbr_j), include_tags = FALSE)
    paf2 <- read_paf(find_paf_file(abbr_j, abbr_i), include_tags = FALSE)

    chromsize <- fread(
      paste0("/Users/rebecca/sudmant/analyses/myotis/data/",
             abbr_i, "_chromsizes"),
      data.table = FALSE
    )
    colnames(chromsize) <- c("Chr", "Chr_Length")

    rbhs <- fread(
      paste0("results/", abbr_i, "_vs_", abbr_j, "_best_hits.csv"),
      data.table = FALSE
    )

    ## Get fraction of genome with hits in PAF1 (target coordinates)

    paf1 <- paf1 %>%
      arrange(desc(nmatch)) %>%
      dplyr::mutate(seqnames = tname, start = tstart, end = tend)
    paf1_lengths <- paf_hit_fraction(paf1, chromsize)

    ## Get fraction of genome with hits in PAF2 (query coordinates)

    paf2 <- paf2 %>%
      arrange(desc(nmatch)) %>%
      dplyr::mutate(seqnames = qname, start = qstart, end = qend)
    paf2_lengths <- paf_hit_fraction(paf2, chromsize)

    ## Get fraction of genome with hits after restricting to RBHs

    rbh1_lengths <- rbhs %>%
      dplyr::mutate(Chr = Spec1_Chr, Length = Spec1_End - Spec1_Start) %>%
      dplyr::group_by(Chr) %>%
      dplyr::summarise(Recip_Length = sum(Length))
    rbh1_lengths <- add_hit_fraction(rbh1_lengths, chromsize, "Frac_RBHs")

    ## Plot comparisons

    compare_hits1 <- merge(paf1_lengths, rbh1_lengths, by = "Chr")
    compare_hits2 <- merge(paf2_lengths, rbh1_lengths, by = "Chr")
    # Both inputs carry `Frac_PAF`, so merge() suffixes them with .x / .y.
    compare_pafs <- merge(paf1_lengths, paf2_lengths, by = "Chr")

    plot_title <- paste("Fraction of", myo_meta$Field_Name[i],
                        "genome with hits in", myo_meta$Field_Name[j], "genome")

    write_plots_pdf(
      paste0("results/", abbr_i, "_vs_", abbr_j, "_best_hit_concordance.pdf"),
      list(
        plot_concordance(compare_hits1, aes(x = Frac_PAF, y = Frac_RBHs),
                         plot_title, "PAF 1", "RBHs"),
        plot_concordance(compare_hits2, aes(x = Frac_PAF, y = Frac_RBHs),
                         plot_title, "PAF 2", "RBHs"),
        plot_concordance(compare_pafs, aes(x = Frac_PAF.x, y = Frac_PAF.y),
                         plot_title, "PAF 1", "PAF 2")
      )
    )
  }
}

In [6]:
## Load the inputs for a single species pair (i, j).
## `i` and `j` are not assigned here; they must already exist in the session
## (for example as the values left behind by a previous `for` loop).

abbr_i <- myo_meta$Abbr[i]
abbr_j <- myo_meta$Abbr[j]
paf_dir <- "resources/pafs/trimmed/"

# File names are matched with the regular expressions "<i>.*<j>" and
# "<j>.*<i>".
paf1_file <- list.files(
  path = paf_dir,
  pattern = paste0(abbr_i, ".*", abbr_j),
  full.names = TRUE
)
paf2_file <- list.files(
  path = paf_dir,
  pattern = paste0(abbr_j, ".*", abbr_i),
  full.names = TRUE
)

# Exactly one path is handed to each read_paf() call below, so fail early with
# a readable message when a pattern matches no file or several files.
if (length(paf1_file) != 1L || length(paf2_file) != 1L) {
  stop(
    "Expected exactly one PAF per direction in '", paf_dir, "' for ",
    abbr_i, " / ", abbr_j, ", found ", length(paf1_file), " and ",
    length(paf2_file), ".",
    call. = FALSE
  )
}

paf1 <- read_paf(paf1_file, include_tags = FALSE)
paf2 <- read_paf(paf2_file, include_tags = FALSE)

# Chromosome sizes of species i: first column is renamed to `Chr`, second to
# `Chr_Length`.
chromsize <- fread(
  paste0("/Users/rebecca/sudmant/analyses/myotis/data/", abbr_i, "_chromsizes"),
  data.table = FALSE
)
colnames(chromsize) <- c("Chr", "Chr_Length")

# Best-hit table for this pair; the Spec1_Chr / Spec1_Start / Spec1_End
# columns are the ones used downstream.
rbhs <- fread(
  paste0("results/", abbr_i, "_vs_", abbr_j, "_best_hits.csv"),
  data.table = FALSE
)


In [None]:

## Single-pair version of the analysis: works on whatever `i`, `j`, `paf1`,
## `paf2`, `chromsize` and `rbhs` currently hold in the session. Intermediate
## objects are kept at top level so they can be inspected interactively.

## Get fraction of genome with hits in PAF1

# Sort best hit first (most matches) and expose the target coordinates under
# the column names makeGRangesFromDataFrame() looks for.
paf1 <- paf1 %>%
  arrange(desc(nmatch)) %>%
  dplyr::mutate(
    seqnames = tname,
    start = tstart,
    end = tend
  )
paf1_ranges <- GenomicRanges::makeGRangesFromDataFrame(
  paf1,
  ignore.strand = TRUE
)
# For each interval take the lowest-index interval overlapping it (itself
# included) and keep every interval selected this way once.
# NOTE(review): kept intervals can still overlap one another, so summed
# lengths may count some bases twice -- confirm this is intended.
paf1_first <- findOverlaps(paf1_ranges, type = "any", select = "first")
paf1_ranges <- paf1_ranges[unique(paf1_first), ]
paf1_df <- as.data.frame(paf1_ranges)

paf1_lengths <- paf1_df %>%
  dplyr::mutate(
    # Chromosome name = text after the last "." in the sequence name.
    Chr = vapply(
      strsplit(as.character(seqnames), ".", fixed = TRUE),
      function(parts) parts[length(parts)],
      character(1)
    ),
    Length = end - start
  ) %>%
  dplyr::group_by(Chr) %>%
  dplyr::summarise(Recip_Length = sum(Length))

paf1_lengths <- merge(chromsize, paf1_lengths, by = 1, all = TRUE)
# Chromosomes without any hit come out of the outer merge as NA.
paf1_lengths$Recip_Length[is.na(paf1_lengths$Recip_Length)] <- 0
paf1_lengths$Frac <- with(paf1_lengths, Recip_Length / Chr_Length)
paf1_lengths$Frac_PAF <- paf1_lengths$Frac

## Get fraction of genome with hits in PAF2

# Same procedure as for PAF1, but on the query coordinates.
paf2 <- paf2 %>%
  arrange(desc(nmatch)) %>%
  dplyr::mutate(
    seqnames = qname,
    start = qstart,
    end = qend
  )
paf2_ranges <- GenomicRanges::makeGRangesFromDataFrame(
  paf2,
  ignore.strand = TRUE
)
paf2_first <- findOverlaps(paf2_ranges, type = "any", select = "first")
paf2_ranges <- paf2_ranges[unique(paf2_first), ]
paf2_df <- as.data.frame(paf2_ranges)

paf2_lengths <- paf2_df %>%
  dplyr::mutate(
    Chr = vapply(
      strsplit(as.character(seqnames), ".", fixed = TRUE),
      function(parts) parts[length(parts)],
      character(1)
    ),
    Length = end - start
  ) %>%
  dplyr::group_by(Chr) %>%
  dplyr::summarise(Recip_Length = sum(Length))
paf2_lengths <- merge(chromsize, paf2_lengths, by = 1, all = TRUE)
paf2_lengths$Recip_Length[is.na(paf2_lengths$Recip_Length)] <- 0
paf2_lengths$Frac <- with(paf2_lengths, Recip_Length / Chr_Length)
paf2_lengths$Frac_PAF <- paf2_lengths$Frac

## Get fraction of genome with hits after restricting to RBHs

rbh1_lengths <- rbhs %>%
  dplyr::mutate(
    Chr = Spec1_Chr,
    Length = Spec1_End - Spec1_Start
  ) %>%
  dplyr::group_by(Chr) %>%
  dplyr::summarise(Recip_Length = sum(Length))
rbh1_lengths <- merge(chromsize, rbh1_lengths, by = 1, all = TRUE)
rbh1_lengths$Recip_Length[is.na(rbh1_lengths$Recip_Length)] <- 0
rbh1_lengths$Frac <- with(rbh1_lengths, Recip_Length / Chr_Length)
rbh1_lengths$Frac_RBHs <- rbh1_lengths$Frac

## Plot comparisons

compare_hits1 <- merge(paf1_lengths, rbh1_lengths, by = "Chr")
compare_hits2 <- merge(paf2_lengths, rbh1_lengths, by = "Chr")
# Both inputs carry `Frac_PAF`, so merge() suffixes them with .x / .y.
compare_pafs <- merge(paf1_lengths, paf2_lengths, by = "Chr")

plot_title <- paste("Fraction of", myo_meta$Field_Name[i],
                    "genome with hits in", myo_meta$Field_Name[j], "genome")
plot_theme <- theme(axis.title.x = element_text(margin = margin(t = 10)),
                    axis.title.y = element_text(margin = margin(r = 10)),
                    plot.margin = unit(c(1, 1, 1, 1), "cm"))

pdf(file = paste0("results/", myo_meta$Abbr[i], "_vs_", myo_meta$Abbr[j],
                  "_best_hit_concordance.pdf"),
    width = 8, height = 8)
pdf_dev <- dev.cur()

# The PDF is only finalised once its device is closed; `finally` closes it
# whether or not a plot fails. invisible() keeps the last plot from being
# auto-printed a second time at top level.
invisible(tryCatch(
  {
    print(
      ggplot(compare_hits1, aes(x = Frac_PAF, y = Frac_RBHs)) +
        geom_point(size = 1) +
        geom_text(aes(label = Chr)) +
        theme_minimal() +
        plot_theme +
        labs(title = plot_title) +
        xlab("PAF 1") + ylab("RBHs")
    )

    print(
      ggplot(compare_hits2, aes(x = Frac_PAF, y = Frac_RBHs)) +
        geom_point(size = 1) +
        geom_text(aes(label = Chr)) +
        theme_minimal() +
        plot_theme +
        labs(title = plot_title) +
        xlab("PAF 2") + ylab("RBHs")
    )

    print(
      ggplot(compare_pafs, aes(x = Frac_PAF.x, y = Frac_PAF.y)) +
        geom_point(size = 1) +
        geom_text(aes(label = Chr)) +
        theme_minimal() +
        plot_theme +
        labs(title = plot_title) +
        xlab("PAF 1") + ylab("PAF 2")
    )
  },
  finally = dev.off(pdf_dev)
))