In [None]:
# Run everything from the species_peaks analysis directory; the relative
# "results/..." paths used further down resolve against it.
setwd("/Users/rebecca/sudmant/analyses/myotis/analysis/exploratory/species_peaks")

# Attach order is left untouched: when two packages export the same name,
# the one attached last is the one an unqualified call resolves to.
library(scales)
library(dplyr)
library(ggplot2)
library(data.table)

# Per-species metadata table. The code below reads its `Abbr` column (to
# pick each species' input file) and its `Field_Name` column (plot labels).
myo_meta <- read.csv("/Users/rebecca/sudmant/analyses/myotis/data/myotis_meta.csv")

In [7]:
## Plot number of TEs per TE class in peaks

peak_files <- list.files(path = "results/data", 
                         pattern = "TEs.tsv", 
                         full.names = TRUE)

# Collapse the detailed repeat labels read from column V11 into the broad
# classes shown on the x axis. Rules are applied in the order listed, so a
# label already rewritten by an earlier rule (e.g. anything starting with
# "SINE") is not re-matched in its original form by a later one (e.g. "RNA").
collapse_te_class <- function(te_class) {
  class_rules <- c(
    "^DNA"    = "DNA",
    "^SINE"   = "SINE",
    "^LINE"   = "LINE",
    "^LTR"    = "LTR",
    "^Simple" = "Simple repeat",
    "Unknown" = "Unknown",
    "RNA"     = "rRNA/tRNA/snRNA",
    "^Sat"    = "Other",
    "^Retro"  = "Other"
  )
  for (pattern in names(class_rules)) {
    te_class[grepl(pattern, te_class)] <- class_rules[[pattern]]
  }
  gsub("_", " ", te_class)
}

# Draw one bar chart of row counts per `Type`, bars ordered from most to
# least frequent. `peak_type` must have a `Type` column with one row per
# (peak, type) pair. Prints the plot to the active device.
plot_peak_type_counts <- function(peak_type, title, subtitle) {
  x_order <- dplyr::count(peak_type, Type, sort = TRUE)
  peak_type$Type <- factor(peak_type$Type, levels = x_order$Type)

  print(
    ggplot(peak_type, aes(x = Type)) +
      geom_bar() +
      theme_minimal() +
      theme(axis.title.x = element_blank(),
            axis.text.x = element_text(angle = 45, hjust = 1),
            axis.title.y = element_blank(),
            plot.margin = unit(c(1, 1, 1, 1), "cm")) +
      labs(title = title, subtitle = subtitle) +
      scale_y_continuous(labels = comma)
  )
}

pdf("results/figures/peaks_TE_classes.pdf", width = 8, height = 6)

# invisible(): the list returned by lapply() holds the printed ggplot objects;
# letting R auto-print it at top level would draw those plots a second time.
# finally: close the PDF device even if one species fails part-way through.
invisible(tryCatch(
  lapply(seq_along(myo_meta$Abbr), function(i) {

    # Exactly one input file must match this species' abbreviation.
    peak_file <- peak_files[grep(myo_meta$Abbr[i], peak_files)]
    if (length(peak_file) != 1L) {
      stop("Expected exactly one TE file matching '", myo_meta$Abbr[i],
           "' in results/data, found ", length(peak_file), call. = FALSE)
    }

    peak_features <- fread(peak_file, data.table = FALSE)
    peak_features$Type <- collapse_te_class(peak_features$V11)

    ## Broad classes: each (V4, Type) pair is counted once; rows whose type
    ## is "." are dropped.
    peak_type <- peak_features %>%
      dplyr::filter(Type != ".") %>%
      dplyr::distinct(V4, Type)

    plot_peak_type_counts(
      peak_type,
      title = myo_meta$Field_Name[i],
      subtitle = "Transposable elements in accessible regions"
    )

    ## Within DNA subtypes: same counting on the uncollapsed V11 labels.
    ## Anchored with "^DNA" so the selection agrees with the class rule above.
    dna_type <- peak_features %>%
      dplyr::filter(grepl("^DNA", V11)) %>%
      dplyr::distinct(V4, Type = V11)

    plot_peak_type_counts(
      dna_type,
      title = myo_meta$Field_Name[i],
      subtitle = "DNA transposable elements in accessible regions"
    )

  }),
  finally = dev.off()
))

## Plot proportion of peaks in TEs

# For each species (in myo_meta order): the fraction of distinct V4 values
# (one per peak, as grouped here) that have at least one row whose V10 is
# not ".". Kept as a list, one numeric scalar per species.
# NOTE(review): this block tests V10 for "." while the TE-class plot tests
# V11 -- confirm both columns carry "." for peaks without a TE.
frac_TEs_list <- lapply(seq_along(myo_meta$Abbr), function(i) {

  # Exactly one input file must match this species' abbreviation.
  peak_file <- peak_files[grep(myo_meta$Abbr[i], peak_files)]
  if (length(peak_file) != 1L) {
    stop("Expected exactly one TE file matching '", myo_meta$Abbr[i],
         "' in results/data, found ", length(peak_file), call. = FALSE)
  }

  peak_features <- fread(peak_file, data.table = FALSE)

  # One row per peak. any() keeps the denominator equal to the number of
  # peaks even if a peak has both TE and non-TE rows (unique() would yield
  # two rows for such a peak and inflate the denominator).
  uniq_peaks <- peak_features %>%
    dplyr::group_by(V4) %>%
    dplyr::summarise(TE = any(V10 != "."), .groups = "drop")

  mean(uniq_peaks$TE)
})

# unlist() gives a plain numeric column rather than a one-column matrix.
df <- data.frame(Species = myo_meta$Field_Name, 
                 Proportion = unlist(frac_TEs_list), 
                 row.names = NULL) %>%
  dplyr::arrange(desc(Proportion))

# Fix the x-axis order to the sorted (descending) order.
df$Species <- factor(df$Species, levels = df$Species)

pdf("results/figures/peaks_TE_proportion.pdf", width = 8, height = 6)

# finally: close the PDF device even if plotting fails.
invisible(tryCatch(
  print(
    ggplot(df, aes(x = Species, y = Proportion)) +
      geom_col() +
      theme_minimal() +
      theme(axis.title.x = element_blank(),
            axis.text.x = element_text(angle = 45, hjust = 1),
            axis.title.y = element_text(margin = margin(r = 15)),
            plot.margin = unit(c(1, 1, 1, 1), "cm")) +
      labs(title = "Proportion of peaks in transposable elements") +
      ylab("Proportion")
  ),
  finally = dev.off()
))

In [10]:
## Plot divergence of TEs in peaks vs. not in peaks

feature_files <- list.files(path = "results/data", 
                            pattern = "TEs_peaks.tsv", 
                            full.names = TRUE)

pdf("results/figures/TE_divergence_vs_peak_status.pdf", width = 6, height = 7)

# invisible(): the list returned by lapply() holds the printed ggplot objects;
# letting R auto-print it at top level would draw every plot a second time.
# finally: close the PDF device even if one species fails part-way through.
invisible(tryCatch(
  lapply(seq_along(myo_meta$Abbr), function(i) {

    # Exactly one input file must match this species' abbreviation.
    feature_file <- feature_files[grep(myo_meta$Abbr[i], feature_files)]
    if (length(feature_file) != 1L) {
      stop("Expected exactly one TEs_peaks file matching '", myo_meta$Abbr[i],
           "' in results/data, found ", length(feature_file), call. = FALSE)
    }

    features_peaks <- fread(feature_file, data.table = FALSE)

    # One record per TE, keyed on V1, V2, V3 and V7 pasted together. All
    # distinct V11 values seen for that key are joined into a single Peak_ID.
    # NOTE(review): if a key maps to several distinct V6 or V5 values,
    # reframe() emits several rows for it (or errors when their counts
    # cannot be recycled) -- confirm the key is unique per TE.
    df <- features_peaks %>%
      dplyr::mutate(ID = paste(V1, V2, V3, V7)) %>%
      dplyr::group_by(ID) %>%
      dplyr::reframe(
        Peak_ID = paste(unique(V11), collapse = ", "),
        Type = unique(V7),
        Repeat_Name = unique(V6),
        Divergence = unique(V5)
      )

    # A Peak_ID of exactly "." marks a TE with no peak; anything else
    # is treated as overlapping a peak.
    df$Peak <- factor(
      ifelse(df$Peak_ID != ".", "Peak", "No peak"),
      levels = c("Peak", "No peak")
    )

    print(
      ggplot(df, aes(x = Peak, y = Divergence)) +
        geom_violin(color = "grey", fill = "grey") +
        geom_boxplot(notch = TRUE, width = .3, outlier.shape = NA) +
        theme_minimal() +
        theme(axis.title.x = element_text(margin = margin(t = 10)),
              axis.text.x = element_text(),
              axis.title.y = element_text(margin = margin(r = 15)),
              plot.margin = unit(c(1, 1, 1, 1), "cm")) +
        labs(title = myo_meta$Field_Name[i], 
             subtitle = "Transposable element divergence") +
        xlab("TE status") + ylab("Divergence") +
        scale_y_continuous(labels = comma)
    )

  }),
  finally = dev.off()
))