In [1]:
# Session setup: fix the working directory, load packages, set numeric
# formatting options and read the species metadata table.

# All relative paths used afterwards (e.g. the metadata CSV read below) are
# resolved against this directory. NOTE(review): this is a machine-specific
# absolute path, so the script only runs as-is on that machine.
setwd("/Users/rebecca/sudmant/analyses/myotis/analysis/reciprocal_best_hits/get_best_hits")

# Attach order matters: data.table is attached after dplyr, so its
# between/first/last mask the dplyr versions.
library(pafr)
library(dplyr, verbose = FALSE)
library(stringr)
library(data.table)

# Strongly prefer fixed over scientific notation and use up to 10 significant
# digits when numbers are converted to text.
# NOTE(review): presumably so large genomic coordinates are not abbreviated
# when pasted into keys or written to CSV -- confirm.
options(scipen = 100, digits = 10)
# Metadata table; its `Abbr` column is used later in the script to iterate
# over species abbreviations.
myo_meta <- read.csv("../../../data/myotis_meta.csv")

Loading required package: ggplot2


Attaching package: 'dplyr'


The following objects are masked from 'package:stats':

    filter, lag


The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union



Attaching package: 'data.table'


The following objects are masked from 'package:dplyr':

    between, first, last




In [2]:
# For every unordered pair of species abbreviations in myo_meta$Abbr (i < j),
# load the two PAF alignment tables (one per direction) and the two
# "positions_intersect" BED tables, restrict them to mappings present in both
# directions, overwrite the PAF coordinates with the intersected BED
# coordinates, keep only the hits whose coordinate key occurs in both PAF
# tables, and write one CSV per direction into "results/".
#
# Inputs (relative to the working directory):
#   resources/pafs/  - a PAF file whose name starts with one abbreviation and
#                      contains "_<other abbreviation>"
#   resources/beds/<A>_vs_<B>_positions_intersect.bed
# Outputs:
#   results/<A>_vs_<B>_best_hits.csv (both directions)
#
# NOTE(review): the BED columns are addressed positionally (V1, V2, V3, V7,
# V8, V15). From the way they are used below, V1/V8 look like sequence names,
# V2/V3 like start/end positions and V7/V15 like the mapping ids of the two
# directions -- confirm against the script that produced the BED files.
for (i in seq_along(myo_meta$Abbr)) {

  for (j in seq_along(myo_meta$Abbr)) {

    # Handle each unordered pair once; both directions are processed together.
    if (i < j) {

      # PAF for direction 1: file name starts with species j and contains
      # "_<species i>".
      working_paf1 <- read_paf(file.path("resources/pafs", intersect(
        list.files(path = "resources/pafs", pattern = paste0("^", myo_meta$Abbr[j])),
        list.files(path = "resources/pafs", pattern = paste0("_", myo_meta$Abbr[i]))
      )))

      # PAF for direction 2: file name starts with species i and contains
      # "_<species j>".
      working_paf2 <- read_paf(file.path("resources/pafs", intersect(
        list.files(path = "resources/pafs", pattern = paste0("^", myo_meta$Abbr[i])),
        list.files(path = "resources/pafs", pattern = paste0("_", myo_meta$Abbr[j]))
      )))

      spec1_intersect <- fread(file.path("resources/beds", paste0(
        myo_meta$Abbr[j], "_vs_", myo_meta$Abbr[i], "_positions_intersect.bed"
      )), data.table = FALSE)

      spec2_intersect <- fread(file.path("resources/beds", paste0(
        myo_meta$Abbr[i], "_vs_", myo_meta$Abbr[j], "_positions_intersect.bed"
      )), data.table = FALSE)

      # Harmonise sequence naming ("SCAF" -> "SUPER") across all tables so the
      # "SUPER.*" extraction below works uniformly.
      working_paf1$qname <- gsub("SCAF", "SUPER", working_paf1$qname)
      working_paf2$qname <- gsub("SCAF", "SUPER", working_paf2$qname)
      working_paf1$tname <- gsub("SCAF", "SUPER", working_paf1$tname)
      working_paf2$tname <- gsub("SCAF", "SUPER", working_paf2$tname)
      spec1_intersect$V1 <- gsub("SCAF", "SUPER", spec1_intersect$V1)
      spec1_intersect$V8 <- gsub("SCAF", "SUPER", spec1_intersect$V8)
      spec2_intersect$V1 <- gsub("SCAF", "SUPER", spec2_intersect$V1)
      spec2_intersect$V8 <- gsub("SCAF", "SUPER", spec2_intersect$V8)

      ## Subset BED files to intersecting sequences with mappings that were
      ## found in both species:

      # The key is "<sequence> <id A> <id B>"; V7 and V15 are swapped between
      # the two tables so that the same pair of mappings yields the same key.
      spec1_intersect$map_id <- paste(
        str_extract(spec1_intersect$V1, pattern = "SUPER.*"),
        as.character(spec1_intersect$V7), as.character(spec1_intersect$V15)
      )
      spec2_intersect$map_id <- paste(
        str_extract(spec2_intersect$V1, pattern = "SUPER.*"),
        as.character(spec2_intersect$V15), as.character(spec2_intersect$V7)
      )
      spec1_intersect <- spec1_intersect[spec1_intersect$map_id %in% spec2_intersect$map_id,]
      spec2_intersect <- spec2_intersect[spec2_intersect$map_id %in% spec1_intersect$map_id,]

      ## Subset PAF files to these sequences:

      spec1_intersect$id <- paste(
        str_extract(spec1_intersect$V1, pattern = "SUPER.*"),
        as.character(spec1_intersect$V7)
      )
      spec2_intersect$id <- paste(
        str_extract(spec2_intersect$V1, pattern = "SUPER.*"),
        as.character(spec2_intersect$V7)
      )

      # Replace the PAF `id` column with "<query sequence> <id>" so it is
      # comparable with the BED-derived ids above.
      working_paf1$id <- paste(
        str_extract(working_paf1$qname, pattern = "SUPER.*"),
        as.character(working_paf1$id)
      )
      working_paf2$id <- paste(
        str_extract(working_paf2$qname, pattern = "SUPER.*"),
        as.character(working_paf2$id)
      )

      working_paf1 <- working_paf1[working_paf1$id %in% spec1_intersect$id,]
      working_paf2 <- working_paf2[working_paf2$id %in% spec2_intersect$id,]

      ## PAF 1: Put intersecting positions in respective species' columns:

      ## Make PAF1 specific identifiers for each sequence:
      # spec1_intersect$id (sequence + V7) computed above is already the
      # PAF1 identifier; only spec2 has to be re-keyed, using V15.
      spec2_intersect$id <- paste(
        str_extract(spec2_intersect$V1, pattern = "SUPER.*"),
        as.character(spec2_intersect$V15)
      )

      # Row of working_paf1 that each BED row refers to.
      # NOTE(review): match() yields NA for BED rows without a PAF row, and
      # the subscripted assignments below fail on NA indices.
      spec1_target_matches <- match(spec1_intersect$id, working_paf1$id)
      spec2_query_matches <- match(spec2_intersect$id, working_paf1$id)

      working_paf1_intersect <- working_paf1
      working_paf1_intersect$tstart[spec1_target_matches] <- spec1_intersect$V2
      working_paf1_intersect$tend[spec1_target_matches] <- spec1_intersect$V3
      working_paf1_intersect$qstart[spec2_query_matches] <- spec2_intersect$V2
      working_paf1_intersect$qend[spec2_query_matches] <- spec2_intersect$V3

      ## PAF 2: Put intersecting positions in respective species' columns:

      ## Make PAF2 specific identifiers for each sequence:
      spec1_intersect$id <- paste(
        str_extract(spec1_intersect$V1, pattern = "SUPER.*"),
        as.character(spec1_intersect$V15)
      )
      spec2_intersect$id <- paste(
        str_extract(spec2_intersect$V1, pattern = "SUPER.*"),
        as.character(spec2_intersect$V7)
      )

      spec2_target_matches <- match(spec2_intersect$id, working_paf2$id)
      spec1_query_matches <- match(spec1_intersect$id, working_paf2$id)

      working_paf2_intersect <- working_paf2
      working_paf2_intersect$tstart[spec2_target_matches] <- spec2_intersect$V2
      working_paf2_intersect$tend[spec2_target_matches] <- spec2_intersect$V3
      working_paf2_intersect$qstart[spec1_query_matches] <- spec1_intersect$V2
      working_paf2_intersect$qend[spec1_query_matches] <- spec1_intersect$V3

      ## Now get intersecting hits between PAFs:

      # Coordinate key per hit. Query and target are swapped for PAF 2 so the
      # same pair of regions produces the same key in both directions.
      working_paf1_intersect$seq <- with(
        working_paf1_intersect, paste(tstart, tend, qstart, qend)
      )
      working_paf2_intersect$seq <- with(
        working_paf2_intersect, paste(qstart, qend, tstart, tend)
      )
      matching_seq <- intersect(
        working_paf1_intersect$seq, working_paf2_intersect$seq
      )
      # Keep only hits found in both directions. The filter is applied to
      # BOTH tables: previously only PAF 1 was subset, so the second output
      # file still contained non-reciprocal hits.
      working_paf1_intersect <- working_paf1_intersect[working_paf1_intersect$seq %in% matching_seq,]
      working_paf2_intersect <- working_paf2_intersect[working_paf2_intersect$seq %in% matching_seq,]

      ## Save results:

      # Values are written unquoted, so "#" in query names is replaced by ".".
      working_paf1_intersect$qname <- gsub("#", ".", working_paf1_intersect$qname, fixed = TRUE)
      working_paf2_intersect$qname <- gsub("#", ".", working_paf2_intersect$qname, fixed = TRUE)

      write.csv(working_paf1_intersect, file = paste0(
        "results/", myo_meta$Abbr[i], "_vs_", myo_meta$Abbr[j], "_best_hits.csv"
      ), row.names = FALSE, quote = FALSE)

      write.csv(working_paf2_intersect, file = paste0(
        "results/", myo_meta$Abbr[j], "_vs_", myo_meta$Abbr[i], "_best_hits.csv"
      ), row.names = FALSE, quote = FALSE)

    }
  }
}