In [187]:
# Work from the reciprocal-best-hits analysis directory; every relative path
# used in this script ("resources/...", "../../data/...") is resolved against it.
# NOTE(review): absolute, user-specific path -- the script only runs where this
# directory exists.
setwd("/Users/rebecca/sudmant/analyses/myotis/analysis/reciprocal_best_hits")

# NOTE(review): dplyr and data.table export some identically named functions;
# the package attached last masks the earlier one, so keep this order.
library(pafr)
library(dplyr)
library(stringr)
library(data.table)

# Bias number formatting towards fixed (non-scientific) notation and show up to
# 10 significant digits.
options(scipen = 100, digits = 10)

# Per-species metadata table; its `Abbr` column is used below to build the PAF
# and BED file names.
myo_meta <- read.csv("../../data/myotis_meta.csv")

In [207]:
## Species pair examined in the cells below (row indices into myo_meta).
i <- 1
j <- 2

## List the PAF directory once and reuse the listing for both lookups.
paf_dir <- "resources/pafs"
paf_files <- list.files(path = paf_dir)

## PAF 1: file name starts with species j's abbreviation and contains
## "_<species i's abbreviation>".
## NOTE(review): the second pattern is not anchored, so an abbreviation that is
## a prefix of another one could match more than one file.
paf1_file <- paf_files[
  grepl(paste0("^", myo_meta$Abbr[j]), paf_files) &
    grepl(paste0("_", myo_meta$Abbr[i]), paf_files)
]

## PAF 2: the reverse pairing (starts with species i, contains "_<species j>").
paf2_file <- paf_files[
  grepl(paste0("^", myo_meta$Abbr[i]), paf_files) &
    grepl(paste0("_", myo_meta$Abbr[j]), paf_files)
]

## Each lookup must resolve to exactly one file; fail here with a readable
## message instead of inside read_paf().
if (length(paf1_file) != 1L || length(paf2_file) != 1L) {
  stop(
    "Expected exactly one PAF file per direction for ",
    myo_meta$Abbr[i], " / ", myo_meta$Abbr[j], "; found ",
    length(paf1_file), " and ", length(paf2_file), ".",
    call. = FALSE
  )
}

working_paf1 <- read_paf(file.path(paf_dir, paf1_file))
working_paf2 <- read_paf(file.path(paf_dir, paf2_file))

## Get intersection of a species' sequences when they are target vs. the query.
## The BED files are named "<A>_vs_<B>_positions_intersect.bed"; one is read
## per direction of the pair.

spec1_bed <- fread(file.path("resources/beds", paste0(
  myo_meta$Abbr[j], "_vs_", myo_meta$Abbr[i], "_positions_intersect.bed"
)), data.table = FALSE)

spec2_bed <- fread(file.path("resources/beds", paste0(
  myo_meta$Abbr[i], "_vs_", myo_meta$Abbr[j], "_positions_intersect.bed"
)), data.table = FALSE)

In [208]:
## Keep only the intersections that are present in both species' BED tables,
## then restrict both PAFs to the alignments those rows refer to.

## Composite key per BED row: the "SUPER..." part of V1 plus the two identifier
## columns. V7 and V15 are written in swapped order for the second table
## (presumably so that keys from both tables line up -- verify against the BED
## layout).
spec1_super <- str_extract(spec1_bed$V1, pattern = "SUPER.*")
spec2_super <- str_extract(spec2_bed$V1, pattern = "SUPER.*")

spec1_bed$id <- paste(
  spec1_super,
  as.character(spec1_bed$V7),
  as.character(spec1_bed$V15)
)
spec2_bed$id <- paste(
  spec2_super,
  as.character(spec2_bed$V15),
  as.character(spec2_bed$V7)
)

## Rows whose key also occurs in the other table. The second mask is computed
## against the already-filtered first table.
spec1_shared <- spec1_bed$id %in% spec2_bed$id
spec1_bed <- spec1_bed[spec1_shared, ]

spec2_shared <- spec2_bed$id %in% spec1_bed$id
spec2_bed <- spec2_bed[spec2_shared, ]

## PAF rows whose `id` appears in column V7 of the matching BED table.
## Ids are compared as character strings on both sides.
paf1_kept <- as.character(working_paf1$id) %in% as.character(spec1_bed$V7)
working_paf1 <- working_paf1[paf1_kept, ]

paf2_kept <- as.character(working_paf2$id) %in% as.character(spec2_bed$V7)
working_paf2 <- working_paf2[paf2_kept, ]

# spec2_bed <- spec2_bed[order(spec2_bed$id),]
# spec1_bed <- spec1_bed[order(spec1_bed$id),]

In [210]:
## PAF 1: overwrite the target and query coordinates with the intersected BED
## intervals (BED columns V2 / V3).

## For each BED row, index of the first PAF 1 row with the same id
## (NA when the id is absent from the PAF).
paf1_ids <- as.character(working_paf1$id)
spec1_target_matches <- match(as.character(spec1_bed$V7), paf1_ids)
spec2_query_matches <- match(as.character(spec2_bed$V15), paf1_ids)

## Work on a copy so the filtered PAF stays available for comparison.
working_paf1_repl <- working_paf1

## Target side takes the intervals from spec1_bed ...
working_paf1_repl$tstart <- replace(
  working_paf1_repl$tstart, spec1_target_matches, spec1_bed$V2
)
working_paf1_repl$tend <- replace(
  working_paf1_repl$tend, spec1_target_matches, spec1_bed$V3
)

## ... and the query side takes the intervals from spec2_bed.
working_paf1_repl$qstart <- replace(
  working_paf1_repl$qstart, spec2_query_matches, spec2_bed$V2
)
working_paf1_repl$qend <- replace(
  working_paf1_repl$qend, spec2_query_matches, spec2_bed$V3
)

In [217]:
## PAF 2: overwrite the target and query coordinates with the intersected BED
## intervals (BED columns V2 / V3). The two species swap roles relative to
## PAF 1.

## For each BED row, index of the first PAF 2 row with the same id
## (NA when the id is absent from the PAF).
paf2_ids <- as.character(working_paf2$id)
spec2_target_matches <- match(as.character(spec2_bed$V7), paf2_ids)
spec1_query_matches <- match(as.character(spec1_bed$V15), paf2_ids)

## Work on a copy so the filtered PAF stays available for comparison.
working_paf2_repl <- working_paf2

## Target side takes the intervals from spec2_bed ...
working_paf2_repl$tstart <- replace(
  working_paf2_repl$tstart, spec2_target_matches, spec2_bed$V2
)
working_paf2_repl$tend <- replace(
  working_paf2_repl$tend, spec2_target_matches, spec2_bed$V3
)

## ... and the query side takes the intervals from spec1_bed.
working_paf2_repl$qstart <- replace(
  working_paf2_repl$qstart, spec1_query_matches, spec1_bed$V2
)
working_paf2_repl$qend <- replace(
  working_paf2_repl$qend, spec1_query_matches, spec1_bed$V3
)

In [None]:
## How does PAF 1 compare to PAF 2?



In [None]:
## Visit every ordered pair (i, j) of distinct species, read the two PAFs for
## that pair and pull out the query-side columns of one and the target-side
## columns of the other.
## NOTE(review): the loop reuses `i` / `j` and overwrites working_paf1,
## working_paf2, spec1_query and spec1_target on every pass, so only the last
## pair's objects remain once it finishes.

## The directory listing does not depend on (i, j): read it once.
paf_dir <- "resources/pafs"
paf_files <- list.files(path = paf_dir)

## Path of the single PAF in `files` whose name starts with `prefix_abbr` and
## contains "_<suffix_abbr>". Stops with a readable message when the pair does
## not resolve to exactly one file, rather than failing inside read_paf().
find_pair_paf <- function(prefix_abbr, suffix_abbr, files, dir) {
  hits <- files[
    grepl(paste0("^", prefix_abbr), files) &
      grepl(paste0("_", suffix_abbr), files)
  ]
  if (length(hits) != 1L) {
    stop(
      "Expected exactly one PAF file starting with '", prefix_abbr,
      "' and containing '_", suffix_abbr, "' in ", dir,
      "; found ", length(hits), ".",
      call. = FALSE
    )
  }
  file.path(dir, hits)
}

for (i in seq_along(myo_meta$Abbr)) {

  for (j in seq_along(myo_meta$Abbr)) {

    ## A species is never paired with itself.
    if (i == j) {
      next
    }

    working_paf1 <- read_paf(
      find_pair_paf(myo_meta$Abbr[j], myo_meta$Abbr[i], paf_files, paf_dir)
    )

    working_paf2 <- read_paf(
      find_pair_paf(myo_meta$Abbr[i], myo_meta$Abbr[j], paf_files, paf_dir)
    )

    ## Query-side coordinates plus alignment statistics from PAF 1.
    spec1_query <- working_paf1 %>%
      dplyr::select(
        qname, qstart, qend, strand,
        nmatch, alen, mapq, id
      )

    ## Target-side coordinates plus alignment statistics from PAF 2.
    spec1_target <- working_paf2 %>%
      dplyr::select(
        tname, tstart, tend,
        nmatch, alen, mapq, id
      )
  }
}