In [26]:
## Session setup for the reciprocal-best-hit (RBH) analysis.
## All relative paths used later in this script ("code/", "resources/pafs",
## "results/") are resolved against this working directory.
setwd("/Users/rebecca/sudmant/analyses/myotis/analysis/reciprocal_best_hits/01_get_best_hits")

## note: pintersect() (GenomicRanges) is the element-wise ("parallel", in the
## pmin()/pmax() sense) intersect of two equal-length range sets; it is not a
## multi-core intersect. It is not called directly in this script --
## presumably it is used inside code/rbh_fns.R (TODO confirm).

## NOTE: attach order matters -- packages attached later mask same-named
## functions from earlier ones, so do not reorder these calls casually.
library(pafr)           # read_paf()
library(dplyr)          # mutate(), select(), row_number(), %>%
library(tidyr)
library(data.table)
library(GenomicRanges)

## Project helper functions; spec_intersect_fn(), used in the loop below, is
## not defined in this script, so it is presumably defined in this file.
source("code/rbh_fns.R")

## scipen = 100 strongly discourages scientific notation when numbers are
## converted to text; presumably set so that genomic coordinates pasted into
## the hit IDs and written to CSV never appear as e.g. 1e+05.
options(scipen = 100, digits = 10)

## Species metadata table; the pairwise loop below iterates over its `Abbr`
## column (species abbreviations).
myo_meta <- read.csv("/Users/rebecca/sudmant/analyses/myotis/data/myotis_meta.csv")

In [None]:
## ---------------------------------------------------------------------------
## Pairwise reciprocal best hits (RBH)
##
## For every unordered pair of species abbreviations (i < j) in myo_meta$Abbr:
##   1. read the two PAF alignments for the pair (one file per direction),
##   2. tag every alignment row with a query ID and a subject ID,
##   3. intersect, per species, the regions hit as alignment target in one PAF
##      with the regions used as alignment query in the other PAF
##      (spec_intersect_fn(), defined outside this script section),
##   4. join the intersections back to the PAFs to attach the partner species'
##      region IDs and the mapq of each PAF,
##   5. keep only hits present in both species' tables, align their row order,
##   6. write results/<i>_vs_<j>_best_hits.csv and
##      results/<j>_vs_<i>_best_hits.csv.
## ---------------------------------------------------------------------------

## Locate the single PAF in `paf_dir` whose file name starts with `lead_abbr`
## and also contains "_<other_abbr>". Stops with an informative error when
## zero or several files match, instead of passing a bad path to read_paf().
find_pair_paf <- function(lead_abbr, other_abbr, paf_dir = "resources/pafs") {
  hits <- intersect(
    list.files(path = paf_dir, pattern = paste0("^", lead_abbr)),
    list.files(path = paf_dir, pattern = paste0("_", other_abbr))
  )
  if (length(hits) != 1) {
    stop(
      "Expected exactly one PAF in '", paf_dir, "' for ", lead_abbr, " / ",
      other_abbr, " but found ", length(hits),
      if (length(hits) > 0) paste0(": ", paste(hits, collapse = ", ")),
      call. = FALSE
    )
  }
  file.path(paf_dir, hits)
}

## Add Query_ID / Subj_ID columns to a PAF table. Each ID is
## "<sequence name> <start> <end> <row number>"; the row number keeps IDs
## unique even when the same interval occurs in several alignment rows.
add_hit_ids <- function(paf) {
  paf %>%
    dplyr::mutate(
      Query_ID = paste(qname, qstart, qend, dplyr::row_number()),
      Subj_ID = paste(tname, tstart, tend, dplyr::row_number())
    )
}

for (i in seq_along(myo_meta$Abbr)) {

  ## Only indices after i, so each unordered pair is processed once
  ## (equivalent to looping over all j and requiring i < j).
  for (j in seq_along(myo_meta$Abbr)[-seq_len(i)]) {

    abbr_i <- myo_meta$Abbr[i]
    abbr_j <- myo_meta$Abbr[j]

    print(paste(abbr_i, abbr_j))

    ## paf1: file named "<abbr_i>..._<abbr_j>..."; paf2: the reverse direction.
    paf1 <- add_hit_ids(
      read_paf(find_pair_paf(abbr_i, abbr_j), include_tags = FALSE)
    )
    paf2 <- add_hit_ids(
      read_paf(find_pair_paf(abbr_j, abbr_i), include_tags = FALSE)
    )

    ## Species 1 data: target side of paf1 and query side of paf2.

    spec1_subj_paf <- paf1 %>%
      dplyr::select(chr = tname, start = tstart, end = tend, name = Subj_ID)

    spec1_query_paf <- paf2 %>%
      dplyr::select(chr = qname, start = qstart, end = qend, name = Query_ID)

    ## Species 2 data: target side of paf2 and query side of paf1.

    spec2_subj_paf <- paf2 %>%
      dplyr::select(chr = tname, start = tstart, end = tend, name = Subj_ID)

    spec2_query_paf <- paf1 %>%
      dplyr::select(chr = qname, start = qstart, end = qend, name = Query_ID)

    ## Get intersection of regions for each species.
    ## The results are used below as having Spec<k>_ID, Spec<k>_Query_ID and
    ## Spec<k>_Subj_ID columns (k = spec.no); Spec<k>_ID is not needed further.

    spec1_intersect <- spec_intersect_fn(spec1_subj_paf,
                                         spec1_query_paf,
                                         spec.no = 1) %>%
      dplyr::select(-Spec1_ID)

    spec2_intersect <- spec_intersect_fn(spec2_subj_paf,
                                         spec2_query_paf,
                                         spec.no = 2) %>%
      dplyr::select(-Spec2_ID)

    ## Merge intersected regions with original PAFs to get the other species'
    ## region info. Each lookup table keeps the join key under its original
    ## name and exposes the partner ID / mapq under a species-prefixed name.

    paf1_spec1 <- paf1 %>%
      dplyr::select(Spec2_Query_ID = Query_ID, Subj_ID, Paf1_Mapq = mapq)

    paf2_spec1 <- paf2 %>%
      dplyr::select(Spec2_Subj_ID = Subj_ID, Query_ID, Paf2_Mapq = mapq)

    spec1_rbhs <- merge(spec1_intersect, paf2_spec1,
                        by.x = "Spec1_Query_ID", by.y = "Query_ID")
    spec1_rbhs <- merge(spec1_rbhs, paf1_spec1,
                        by.x = "Spec1_Subj_ID", by.y = "Subj_ID")

    paf1_spec2 <- paf1 %>%
      dplyr::select(Spec1_Subj_ID = Subj_ID, Query_ID, Paf1_Mapq = mapq)

    paf2_spec2 <- paf2 %>%
      dplyr::select(Spec1_Query_ID = Query_ID, Subj_ID, Paf2_Mapq = mapq)

    spec2_rbhs <- merge(spec2_intersect, paf2_spec2,
                        by.x = "Spec2_Subj_ID", by.y = "Subj_ID")
    spec2_rbhs <- merge(spec2_rbhs, paf1_spec2,
                        by.x = "Spec2_Query_ID", by.y = "Query_ID")

    ## Restrict to hits that appeared in both PAFs:

    ## Keep species-1 rows whose species-2 ID pair also occurs in spec2_rbhs ...
    spec1_rbhs <- spec1_rbhs[
      paste(spec1_rbhs$Spec2_Subj_ID, spec1_rbhs$Spec2_Query_ID) %in%
        paste(spec2_rbhs$Spec2_Subj_ID, spec2_rbhs$Spec2_Query_ID), ]

    ## ... and species-2 rows whose species-1 ID pair survived that filter.
    spec2_rbhs <- spec2_rbhs[
      paste(spec2_rbhs$Spec1_Subj_ID, spec2_rbhs$Spec1_Query_ID) %in%
        paste(spec1_rbhs$Spec1_Subj_ID, spec1_rbhs$Spec1_Query_ID), ]

    ## Put the species-1 table into the row order of the species-2 table.
    spec1_rbhs <- spec1_rbhs[
      match(paste(spec2_rbhs$Spec2_Subj_ID, spec2_rbhs$Spec2_Query_ID),
            paste(spec1_rbhs$Spec2_Subj_ID, spec1_rbhs$Spec2_Query_ID)), ]

    ## Put the species-2 table into the row order of the species-1 table.
    ## Bug fix: this previously indexed spec1_rbhs, which overwrote the
    ## species-2 table with species-1 rows before it was written to disk.
    spec2_rbhs <- spec2_rbhs[
      match(paste(spec1_rbhs$Spec1_Subj_ID, spec1_rbhs$Spec1_Query_ID),
            paste(spec2_rbhs$Spec1_Subj_ID, spec2_rbhs$Spec1_Query_ID)), ]

    write.csv(spec1_rbhs, file = paste0(
      "results/", abbr_i, "_vs_", abbr_j, "_best_hits.csv"
    ), row.names = FALSE, quote = FALSE)

    write.csv(spec2_rbhs, file = paste0(
      "results/", abbr_j, "_vs_", abbr_i, "_best_hits.csv"
    ), row.names = FALSE, quote = FALSE)

  }

}