In [1]:
setwd("/Users/rebecca/sudmant/analyses/myotis/analysis/reciprocal_best_hits/contextualize_best_hits")

library(dplyr, verbose = FALSE)
library(tidyr)
library(data.table)

myo_meta <- read.csv("../../../data/myotis_meta.csv")


Attaching package: 'dplyr'


The following objects are masked from 'package:stats':

    filter, lag


The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union



Attaching package: 'data.table'


The following objects are masked from 'package:dplyr':

    between, first, last




In [2]:
## How do the genes with differential peaks compare?

i = 1
# for (i in seq_along(myo_meta$Abbr)) {

  print(myo_meta$Field_Name[i])
  
  inds <- seq_along(myo_meta$Abbr)

  spec2_data <- suppressMessages(lapply(inds[inds != i], function(j) {

    ## Load in peak data for species 1: 
    ## peaks in reciprocal regions that intersect orthologous genes
    peaks1_genes <- fread(
      file.path("resources/genes/peaks", 
      list.files(path = "resources/genes/peaks", pattern = paste0(myo_meta$Abbr[i], "_vs_", myo_meta$Abbr[j], ".*_only.csv"))),
      data.table = FALSE
    )
    peaks1_genes <- peaks1_genes %>%
    dplyr::group_by(myotis_ali_fn) %>%
    dplyr::slice_max(as.numeric(Intersect_Start)) %>%
    dplyr::select(myotis_ali_fn, Peak_ID)

    ## Load in gene data for species 1: 
    ## Species 1 orthologous genes in reciprocal best hit regions
    hits1_genes <- fread(
      file.path("resources/genes", 
      list.files(path = "resources/genes", pattern = paste0(myo_meta$Abbr[i], "_vs_", myo_meta$Abbr[j], ".*_only.csv"))),
      data.table = FALSE
    )
    hits1_genes <- hits1_genes %>%
      dplyr::mutate(Reciprocal_Seq = paste(Chr, Reciprocal_Start, Reciprocal_End)) %>%
      dplyr::group_by(Chr, myotis_ali_fn, Reciprocal_Seq) %>%
      dplyr::summarise(human_gene_name = unique(human_gene_name))

    ## Load in peak data for species 2: 
    ## peaks in reciprocal regions that intersect orthologous genes
    peaks2_genes <- fread(
      file.path("resources/genes/peaks", 
      list.files(path = "resources/genes/peaks", pattern = paste0(myo_meta$Abbr[j], "_vs_", myo_meta$Abbr[i], ".*_only.csv"))),
      data.table = FALSE
    )
    peaks2_genes <- peaks2_genes %>%
      dplyr::group_by(myotis_ali_fn) %>%
      dplyr::slice_max(as.numeric(Intersect_Start)) %>%
      dplyr::select(Peak_ID)

    ## Load in gene data for species 2: 
    ## Species 1 orthologous genes in reciprocal best hit regions
    hits2_genes <- fread(
      file.path("resources/genes", 
      list.files(path = "resources/genes", pattern = paste0(myo_meta$Abbr[j], "_vs_", myo_meta$Abbr[i], ".*_only.csv"))),
      data.table = FALSE
    )
    hits2_genes <- hits2_genes %>%
      dplyr::mutate(Reciprocal_Seq = paste(Chr, Reciprocal_Start, Reciprocal_End)) %>%
      dplyr::group_by(Chr, myotis_ali_fn, Reciprocal_Seq) %>%
      dplyr::summarise(human_gene_name = unique(human_gene_name))

    ## Genes not present in peak file implies no peak at that location:
    hits1_status <- merge(hits1_genes, peaks1_genes, by = "myotis_ali_fn", all.x = TRUE)
    hits1_status$Peak <- TRUE
    hits1_status$Peak[is.na(hits1_status$Peak_ID)] <- FALSE
    hits2_status <- merge(hits2_genes, peaks2_genes, by = "myotis_ali_fn", all.x = TRUE)
    hits2_status$Peak <- TRUE
    hits2_status$Peak[is.na(hits2_status$Peak_ID)] <- FALSE

    ## Subset to orthologous genes present in both species:
    hits1_status <- hits1_status[hits1_status$myotis_ali_fn %in% hits2_status$myotis_ali_fn,]
    hits2_status <- hits2_status[match(hits1_status$myotis_ali_fn, hits2_status$myotis_ali_fn),]
    if (!identical(hits2_status$myotis_ali_fn, hits1_status$myotis_ali_fn)) {
      print(paste(i, j))
      stop("Genes do not match between species")
    }

    ## What genes are differentially accessible?
    return(unique(hits1_status$human_gene_name))
    
  }))

names(spec2_data) <- myo_meta$Abbr[inds[inds != i]]

[1] "Myotis-Auriculus"


In [None]:
# jaccard_index <- function(x, y) {
  
#   intrsct <- length(intersect(x, y))
#   union <- length(x) + length(y) - intrsct
#   return(intrsct/union)
  
# }
# jac_idx <- lapply(list, function(x) unlist(lapply(list, function(y){jaccard_index(x, y)*100})))
# mat <- do.call(cbind, jac_idx)

In [15]:
head(table(stack(spec2_data)))

         ind
values    mMyoCai mMyoEvo mMyoLuc mMyoOcc mMyoThy mMyoVel mMyoVol mMyoYum
  A1BG          1       0       0       0       1       1       0       0
  A1CF          1       1       0       0       1       1       1       0
  A2M           0       1       0       0       1       1       0       0
  A2ML1         0       1       0       0       1       1       0       0
  A3GALT2       0       1       1       1       1       1       0       1
  A4GALT        0       1       1       0       1       1       0       0

In [None]:
  df <- do.call(rbind, spec2_data)
  df$Species <- gsub("Myotis-", "", df$Species)
  df$Is_Peak_Shared <- factor(df$Is_Peak_Shared, levels = c(TRUE, FALSE))

  pdf(file = paste0("results/", myo_meta$Abbr[i], "_orthologous_gene_differential_peaks.pdf"), width = 7, height = 7)

  spec_order <- df %>%
    dplyr::filter(Is_Peak_Shared == FALSE) %>%
    dplyr::arrange(desc(Frac_Peaks))  

  df$Species <- factor(df$Species, levels = spec_order$Species)

  print(
    ggplot(df, aes(x = Species, y = Frac_Peaks, fill = Is_Peak_Shared)) +
    geom_bar(stat = "identity", color = "black") +
    theme_minimal() +
    theme(plot.subtitle = element_text(margin = margin(b = 10)), 
          axis.title.x = element_blank(),
          axis.text.x = element_text(margin = margin(t = -10, b = 10), size = 11, angle = 45, hjust = 1),
          axis.title.y = element_text(margin = margin(r = 15)),
          axis.text.y = element_text(size = 11),
          axis.ticks.y = element_line(color = "black"),
          legend.position = "bottom",
          legend.direction = "horizontal",
          panel.grid = element_blank(),
          plot.margin = unit(c(1, 1, 1, 1), "cm")) +
    labs(title = myo_meta$Field_Name[i], subtitle = "Peak status in orthologous genes") +
    ylab(paste("Fraction of genes\n(present in both species)")) +
    scale_fill_manual(name = "Is peak conserved between species?", values = brewer.pal(3, "Paired"))
  )
  
  spec_order <- df %>%
    dplyr::filter(Is_Peak_Shared == TRUE) %>%
    dplyr::arrange(desc(No.Peaks))  

  df$Species <- factor(df$Species, levels = spec_order$Species)

  print(
    ggplot(df, aes(x = Species, y = No.Peaks, fill = Is_Peak_Shared)) +
    geom_bar(stat = "identity", position = "dodge", color = "black") +
    theme_minimal() +
    theme(plot.subtitle = element_text(margin = margin(b = 10)), 
          axis.title.x = element_blank(),
          axis.text.x = element_text(margin = margin(t = -10, b = 10), size = 11, angle = 45, hjust = 1),
          axis.title.y = element_text(margin = margin(r = 15)),
          axis.text.y = element_text(size = 11),
          axis.ticks.y = element_line(color = "black"),
          legend.position = "bottom",
          legend.direction = "horizontal",
          panel.grid = element_blank(),
          plot.margin = unit(c(1, 1, 1, 1), "cm")) +
    labs(title = myo_meta$Field_Name[i], subtitle = "Peak status in orthologous genes") +
    ylab(paste("# genes")) +
    scale_y_continuous(labels = comma) +
    scale_fill_manual(name = "Is peak conserved between species?", values = brewer.pal(3, "Paired"))
  )  

  df <- df[df$Is_Peak_Shared == FALSE,]

  print(
    ggplot(df, aes(x = Frac_Peaks, y = Cor)) +
    geom_point() +
    geom_text_repel(aes(label = Species), size = 4) +
    theme_minimal() +
    theme(axis.title.x = element_text(size = 12, margin = margin(t = 15)),
          axis.text.x = element_text(size = 11),
          axis.title.y = element_text(size = 12, margin = margin(r = 15)),
          axis.text.y = element_text(size = 11),
          axis.ticks.y = element_line(color = "black"),
          legend.position = "bottom",
          legend.direction = "horizontal",
          plot.margin = unit(c(1, 1, 1, 1), "cm")) +
    labs(title = myo_meta$Field_Name[i], subtitle = "Peak status in orthologous genes") +
    xlab("Fraction of genes with differential peak status") +
    ylab("Correlation between peaks status")
  )

  dev.off()

# }