In [2]:
## Run from the RBH analysis directory so that the relative paths used in this
## script ("code/rbh_fns.R", "results/data/") resolve.
## NOTE(review): hard-coded, user-specific absolute path -- the script only
## runs unmodified on a machine with this directory layout.
setwd("/Users/rebecca/sudmant/analyses/myotis/analysis/reciprocal_best_hits/01_RBHs")

## pafr supplies read_paf(); dplyr supplies %>%, mutate(), select() and
## row_number() as used below. tidyr, data.table and GenomicRanges are not
## called directly in this part of the script -- presumably required by the
## helpers in code/rbh_fns.R (TODO confirm).
library(pafr)
library(dplyr)
library(tidyr)
library(data.table)
library(GenomicRanges)

## Project helper functions; presumably defines spec_intersect_fn(), which is
## called in the pairwise loop below -- TODO confirm.
source("code/rbh_fns.R")

## Strongly penalise scientific notation and print up to 10 significant
## digits. The alignment IDs built below paste numeric coordinates into
## strings, so this presumably keeps large coordinates from being rendered
## like "1e+05" in IDs and in the CSV output.
options(scipen = 100, digits = 10)

## Species metadata table; its Abbr column provides the species abbreviations
## used to locate PAF files and to name the output files.
myo_meta <- read.csv("/Users/rebecca/sudmant/analyses/myotis/data/myotis_meta.csv")

## Directory containing the pairwise PAF alignment files.
paf_dir <- "/Users/rebecca/sudmant/analyses/myotis/data/pafs"

## Identify reciprocal best hits (RBHs) between every pair of species

In [2]:
## Pairwise reciprocal-best-hit (RBH) extraction.
##
## For every unordered pair of species in myo_meta$Abbr this cell:
##   1. reads the two PAF files for the pair (one per alignment direction),
##   2. intersects, within each species, the regions that appear as subject
##      in one PAF and as query in the other (via spec_intersect_fn()),
##   3. maps each intersected region back to its partner region in the other
##      species, keeps only hits present in both directions, and
##   4. writes one CSV per direction to results/data/.

## Read the single PAF whose file name starts with `first_abbr` and also
## contains "_<second_abbr>".
##
## Stops with an informative message when the lookup does not resolve to
## exactly one file, instead of handing read_paf() zero or several paths and
## failing with an unrelated-looking error.
read_pair_paf <- function(paf_dir, first_abbr, second_abbr) {
  hits <- intersect(
    list.files(path = paf_dir, pattern = paste0("^", first_abbr)),
    list.files(path = paf_dir, pattern = paste0("_", second_abbr))
  )
  if (length(hits) != 1L) {
    found <- if (length(hits) > 0L) paste(hits, collapse = ", ") else "none"
    stop(
      "Expected exactly one PAF file in '", paf_dir, "' starting with '",
      first_abbr, "' and containing '_", second_abbr, "', but found: ", found,
      call. = FALSE
    )
  }
  read_paf(file.path(paf_dir, hits), include_tags = FALSE)
}

## Harmonise sequence names ("#" -> ".", "SCAF" -> "SUPER") and attach a
## query-side and a subject-side ID to every alignment. The row number is
## part of each ID, so both IDs identify one specific PAF row even when
## several alignments share the same coordinates.
add_hit_ids <- function(paf) {
  paf %>%
    dplyr::mutate(
      qname = gsub("SCAF", "SUPER", gsub("#", ".", qname, fixed = TRUE)),
      tname = gsub("SCAF", "SUPER", gsub("#", ".", tname, fixed = TRUE)),
      ## mutate() evaluates sequentially, so the IDs use the cleaned names
      Query_ID = paste(qname, qstart, qend, row_number()),
      Subj_ID = paste(tname, tstart, tend, row_number())
    )
}

## Create the output directory up front so a missing folder is not discovered
## only after the first pair has been fully processed.
out_dir <- "results/data"
dir.create(out_dir, recursive = TRUE, showWarnings = FALSE)

abbrs <- myo_meta$Abbr

for (i in seq_along(abbrs)) {

  for (j in seq_along(abbrs)) {

    ## Visit each unordered pair exactly once (i < j)
    if (i >= j) {
      next
    }

    print(paste(abbrs[i], abbrs[j]))

    ## Read in bi-directional PAFs for species 1 & 2.
    ## paf1: file named <species 1>..._<species 2>; its target (subject)
    ##       coordinates are treated as species 1 and its query as species 2.
    ## paf2: the reverse direction.

    paf1 <- add_hit_ids(read_pair_paf(paf_dir, abbrs[i], abbrs[j]))
    paf2 <- add_hit_ids(read_pair_paf(paf_dir, abbrs[j], abbrs[i]))

    ## Get data from each species for each PAF file

    ## Species 1 PAF data: subject side of paf1, query side of paf2

    spec1_subj_paf <- paf1 %>%
      dplyr::select(chr = tname, start = tstart, end = tend, name = Subj_ID)

    spec1_query_paf <- paf2 %>%
      dplyr::select(chr = qname, start = qstart, end = qend, name = Query_ID)

    ## Species 2 PAF data: subject side of paf2, query side of paf1

    spec2_subj_paf <- paf2 %>%
      dplyr::select(chr = tname, start = tstart, end = tend, name = Subj_ID)

    spec2_query_paf <- paf1 %>%
      dplyr::select(chr = qname, start = qstart, end = qend, name = Query_ID)

    ## Get intersection of regions in PAF files for species 1 & 2.
    ## NOTE(review): spec_intersect_fn() is defined outside this cell; the
    ## code below relies on it returning Spec<n>_Subj_ID, Spec<n>_Query_ID
    ## and Spec<n>_ID columns -- confirm against code/rbh_fns.R.

    spec1_intersect <- spec_intersect_fn(spec1_subj_paf,
                                         spec1_query_paf,
                                         spec.no = 1) %>%
      dplyr::select(-Spec1_ID)
    spec2_intersect <- spec_intersect_fn(spec2_subj_paf,
                                         spec2_query_paf,
                                         spec.no = 2) %>%
      dplyr::select(-Spec2_ID)

    ## Merge species 1 intersected regions with original PAFs to get the
    ## corresponding region in species 2 (and vice versa). Each lookup table
    ## keeps the join key under its original name and renames the partner ID
    ## and mapping quality.

    paf1_spec1 <- paf1 %>%
      dplyr::select(Spec2_Query_ID = Query_ID, Subj_ID, Paf1_Mapq = mapq)

    paf2_spec1 <- paf2 %>%
      dplyr::select(Spec2_Subj_ID = Subj_ID, Query_ID, Paf2_Mapq = mapq)

    spec1_rbhs <- merge(spec1_intersect, paf2_spec1,
                        by.x = "Spec1_Query_ID", by.y = "Query_ID")
    spec1_rbhs <- merge(spec1_rbhs, paf1_spec1,
                        by.x = "Spec1_Subj_ID", by.y = "Subj_ID")

    paf1_spec2 <- paf1 %>%
      dplyr::select(Spec1_Subj_ID = Subj_ID, Query_ID, Paf1_Mapq = mapq)

    paf2_spec2 <- paf2 %>%
      dplyr::select(Spec1_Query_ID = Query_ID, Subj_ID, Paf2_Mapq = mapq)

    spec2_rbhs <- merge(spec2_intersect, paf2_spec2,
                        by.x = "Spec2_Subj_ID", by.y = "Subj_ID")
    spec2_rbhs <- merge(spec2_rbhs, paf1_spec2,
                        by.x = "Spec2_Query_ID", by.y = "Query_ID")

    ## Subset intersected PAFs to restrict to hits found in BOTH directions.
    ## The keys are recomputed at every step on purpose: each statement
    ## works on the tables as filtered by the previous one.

    spec1_rbhs <- spec1_rbhs[
      paste(spec1_rbhs$Spec2_Subj_ID, spec1_rbhs$Spec2_Query_ID) %in%
        paste(spec2_rbhs$Spec2_Subj_ID, spec2_rbhs$Spec2_Query_ID),
    ]
    spec2_rbhs <- spec2_rbhs[
      paste(spec2_rbhs$Spec1_Subj_ID, spec2_rbhs$Spec1_Query_ID) %in%
        paste(spec1_rbhs$Spec1_Subj_ID, spec1_rbhs$Spec1_Query_ID),
    ]

    ## Put both tables in the same row order so that row k of one file
    ## corresponds to row k of the other.
    spec1_rbhs <- spec1_rbhs[
      match(paste(spec2_rbhs$Spec2_Subj_ID, spec2_rbhs$Spec2_Query_ID),
            paste(spec1_rbhs$Spec2_Subj_ID, spec1_rbhs$Spec2_Query_ID)),
    ]
    spec2_rbhs <- spec2_rbhs[
      match(paste(spec1_rbhs$Spec1_Subj_ID, spec1_rbhs$Spec1_Query_ID),
            paste(spec2_rbhs$Spec1_Subj_ID, spec2_rbhs$Spec1_Query_ID)),
    ]

    ## Strip the species number from the names of columns 3-5.
    ## NOTE(review): positional -- which columns sit at 3:5 depends on what
    ## spec_intersect_fn() returns; verify if that helper changes.
    colnames(spec1_rbhs)[3:5] <- gsub("1", "", colnames(spec1_rbhs)[3:5])
    colnames(spec2_rbhs)[3:5] <- gsub("2", "", colnames(spec2_rbhs)[3:5])

    write.csv(
      spec1_rbhs,
      file = file.path(out_dir, paste0(abbrs[i], "_vs_", abbrs[j], "_RBHs.csv")),
      row.names = FALSE, quote = FALSE
    )

    write.csv(
      spec2_rbhs,
      file = file.path(out_dir, paste0(abbrs[j], "_vs_", abbrs[i], "_RBHs.csv")),
      row.names = FALSE, quote = FALSE
    )

  }

}

[1] "mMyoAui mMyoCai"
[1] "mMyoAui mMyoEvo"
[1] "mMyoAui mMyoLuc"
[1] "mMyoAui mMyoOcc"
[1] "mMyoAui mMyoThy"
[1] "mMyoAui mMyoVel"
[1] "mMyoAui mMyoVol"
[1] "mMyoAui mMyoYum"
[1] "mMyoCai mMyoEvo"
[1] "mMyoCai mMyoLuc"
[1] "mMyoCai mMyoOcc"
[1] "mMyoCai mMyoThy"
[1] "mMyoCai mMyoVel"
[1] "mMyoCai mMyoVol"
[1] "mMyoCai mMyoYum"
[1] "mMyoEvo mMyoLuc"
[1] "mMyoEvo mMyoOcc"
[1] "mMyoEvo mMyoThy"
[1] "mMyoEvo mMyoVel"
[1] "mMyoEvo mMyoVol"
[1] "mMyoEvo mMyoYum"
[1] "mMyoLuc mMyoOcc"
[1] "mMyoLuc mMyoThy"
[1] "mMyoLuc mMyoVel"
[1] "mMyoLuc mMyoVol"
[1] "mMyoLuc mMyoYum"
[1] "mMyoOcc mMyoThy"
[1] "mMyoOcc mMyoVel"
[1] "mMyoOcc mMyoVol"
[1] "mMyoOcc mMyoYum"
[1] "mMyoThy mMyoVel"
[1] "mMyoThy mMyoVol"
[1] "mMyoThy mMyoYum"
[1] "mMyoVel mMyoVol"
[1] "mMyoVel mMyoYum"
[1] "mMyoVol mMyoYum"
