In [180]:
## Run from the species_peaks analysis directory: the relative
## "results/data" and "results/figures" paths used below resolve against it.
## NOTE(review): absolute, user-specific path -- as written the script only
## runs where this directory exists.
setwd("/Users/rebecca/sudmant/analyses/myotis/analysis/exploratory/species_peaks")

## Packages are attached in this order; a later package masks same-named
## functions from an earlier one, so keep the order stable.
library(dplyr)
library(scales)
library(ggplot2)
library(data.table)
library(GenomicRanges)

## Species metadata. The code below reads its Abbr column (to build input
## file names) and its Field_Name column (for plot labels).
myo_meta <- read.csv("/Users/rebecca/sudmant/analyses/myotis/data/myotis_meta.csv")

In [138]:
## Are TEs in accessible vs. inaccessible regions?
##
## For every species in myo_meta, count TEs with and without an overlapping
## peak. Each list entry holds two tables (columns Type, Peak, No.Elements):
##   All - every TE, grouped by broad repeat class
##   DNA - rows whose V7 class contains "DNA", grouped by V7 with "DNA/" removed

frac_list <- lapply(seq_along(myo_meta$Abbr), function(i) {

  ## Ordered rules for collapsing V7 into broad classes. Order matters: each
  ## pattern is matched against the labels produced by the earlier rules.
  type_patterns <- c(
    "^DNA", "^SINE", "^LINE", "^LTR", "^Simple",
    "Unknown", "RNA", "^Sat", "^Retro"
  )
  type_labels <- c(
    "DNA", "SINE", "LINE", "LTR", "Simple repeat",
    "Unknown", "rRNA/tRNA/snRNA", "Other", "Other"
  )

  ## Shared tail of both summaries: flag elements whose collapsed peak ID is
  ## anything other than ".", relabel elements carrying more than one type
  ## as "Unknown", then count elements per (Type, Peak).
  count_by_type_and_peak <- function(per_element) {
    per_element %>%
      dplyr::mutate(
        Peak = Peak_ID != ".",
        Type = ifelse(grepl(",", Type), "Unknown", Type)
      ) %>%
      dplyr::group_by(Type, Peak) %>%
      dplyr::reframe(No.Elements = n())
  }

  ## Load in TE peak data
  te_path <- file.path(
    "results/data",
    paste0(myo_meta$Abbr[i], "_TEs_peaks.tsv")
  )
  features_peaks <- fread(te_path, data.table = FALSE)

  ## Collapse the repeat class (V7) into a broad Type
  te_type <- features_peaks$V7
  for (k in seq_along(type_patterns)) {
    te_type[grepl(type_patterns[k], te_type)] <- type_labels[k]
  }
  features_peaks$Type <- gsub("_", " ", te_type)

  ## Which TEs are occupied by peaks? One row per element (V1, V2, V3),
  ## with all of its peak IDs and types pasted together.
  all_elements <- features_peaks %>%
    dplyr::group_by(ID = paste(V1, V2, V3)) %>%
    dplyr::reframe(
      Peak_ID = paste(unique(V11), collapse = ", "),
      Type = paste(unique(Type), collapse = ", ")
    )
  df_all <- count_by_type_and_peak(all_elements)

  ## Same summary restricted to DNA elements, keyed by the V7 class with
  ## the "DNA/" prefix removed
  dna_elements <- features_peaks %>%
    dplyr::filter(grepl("DNA", V7)) %>%
    dplyr::group_by(ID = paste(V1, V2, V3)) %>%
    dplyr::reframe(
      Peak_ID = paste(unique(V11), collapse = ", "),
      Type = paste(
        gsub("DNA/", "", unique(V7), fixed = TRUE),
        collapse = ", "
      )
    )
  df_DNA <- count_by_type_and_peak(dna_elements)

  list(All = df_all, DNA = df_DNA)

})

In [179]:
## Proportion of TEs with peaks per species
##
## Writes five pages to results/figures/TE_peak_proportion.pdf from the
## per-species tables in frac_list:
##   1. fraction of all TEs with / without a peak, per species
##   2. the same fraction within each TE type, faceted by species
##   3. element counts per TE type, faceted by species
##   4. fraction with / without a peak per DNA element type, faceted by species
##   5. element counts per DNA element type, faceted by species

pdf("results/figures/TE_peak_proportion.pdf", width = 12, height = 9)

## Summarize over all TE types:

## Stack the first ("All") table of every frac_list entry, tagged with the
## species' field name. seq_along() avoids the 1:0 sequence that
## 1:length() produces for an empty list.
df_all <- do.call(rbind, lapply(seq_along(frac_list), function(i) {
  df <- frac_list[[i]][[1]]
  return(data.frame(Species = myo_meta$Field_Name[i], df))
}))

## Fraction of each species' elements falling in every (Type, Peak) cell
df_all <- df_all %>%
  dplyr::group_by(Species) %>%
  dplyr::mutate(
    Frac = No.Elements/sum(No.Elements)
  )

## Order species by their total fraction of TEs that have a peak
x_order <- df_all %>%
  dplyr::filter(Peak == TRUE) %>%
  dplyr::group_by(Species) %>%
  dplyr::reframe(n = sum(Frac)) %>%
  dplyr::arrange(desc(n))

## A species with no Peak == TRUE rows is absent from x_order; append it so
## it is placed last instead of being turned into an NA factor level.
df_all$Species <- factor(
  df_all$Species,
  levels = union(x_order$Species, unique(df_all$Species))
)

print(
  ggplot(df_all, aes(x = Species, y = Frac, fill = Peak)) +
    geom_bar(stat = "identity") +
    theme_minimal() +
    theme(axis.title.x = element_blank(),
          axis.text.x = element_text(angle = 45, hjust = 1),
          axis.title.y = element_text(margin = margin(r = 15)),
          plot.margin = unit(c(4, 6, 4, 6), "cm")) +
    labs(title = "Proportion of transposable elements with peaks") +
    ylab("Proportion")
)

## Stratify by TE type:

## Recompute Frac within each (Species, Type) so each type's bar sums to 1
df_all <- df_all %>%
  dplyr::group_by(Species, Type) %>%
  dplyr::mutate(
    Frac = No.Elements/sum(No.Elements)
  )

print(
  ggplot(df_all, aes(x = Type, y = Frac, fill = Peak)) +
    geom_bar(stat = "identity") +
    theme_minimal() +
    theme(axis.title.x = element_blank(),
          axis.text.x = element_text(angle = 45, hjust = 1),
          axis.title.y = element_text(margin = margin(r = 15)),
          panel.grid = element_blank(),
          plot.margin = unit(c(1, 3, 1, 3), "cm")) +
    labs(title = "Proportion of transposable elements with peaks") +
    ylab("Proportion") +
    facet_wrap(. ~ Species)
)

print(
  ggplot(df_all, aes(x = Type, y = No.Elements, fill = Peak)) +
    geom_bar(stat = "identity") +
    theme_minimal() +
    theme(axis.title.x = element_blank(),
          axis.text.x = element_text(angle = 45, hjust = 1),
          axis.title.y = element_text(margin = margin(r = 15)),
          panel.grid = element_blank(),
          plot.margin = unit(c(1, 3, 1, 3), "cm")) +
    labs(title = "# of transposable elements with peaks") +
    ylab("# elements") + scale_y_continuous(labels = comma) +
    facet_wrap(. ~ Species)
)

## Subset to DNA elements:

## Stack the second ("DNA") table of every frac_list entry
df_DNA <- do.call(rbind, lapply(seq_along(frac_list), function(i) {
  df <- frac_list[[i]][[2]]
  return(data.frame(Species = myo_meta$Field_Name[i], df))
}))
df_DNA <- df_DNA %>%
  dplyr::group_by(Species, Type) %>%
  dplyr::mutate(
    Frac = No.Elements/sum(No.Elements)
  )

print(
  ggplot(df_DNA, aes(x = Type, y = Frac, fill = Peak)) +
    geom_bar(stat = "identity") +
    theme_minimal() +
    theme(axis.title.x = element_blank(),
          axis.text.x = element_text(angle = 45, hjust = 1, size = 7),
          axis.title.y = element_text(margin = margin(r = 15)),
          panel.grid = element_blank(),
          plot.margin = unit(c(1, 1, 1, 1), "cm")) +
    labs(title = "Proportion of transposable elements with peaks") +
    ylab("Proportion") +
    facet_wrap(. ~ Species, scales = "free_x")
)

print(
  ggplot(df_DNA, aes(x = Type, y = No.Elements, fill = Peak)) +
    geom_bar(stat = "identity") +
    theme_minimal() +
    theme(axis.title.x = element_blank(),
          axis.text.x = element_text(angle = 45, hjust = 1, size = 7),
          axis.title.y = element_text(margin = margin(r = 15)),
          panel.grid = element_blank(),
          plot.margin = unit(c(1, 1, 1, 1), "cm")) +
    labs(title = "# transposable elements with peaks") +
    ylab("# elements") + scale_y_continuous(labels = comma) +
    facet_wrap(. ~ Species, scales = "free_x")
)

dev.off()

In [None]:
## Plot TE divergence in accessible vs. nonaccessible regions:
##
## For every species, writes two pages to
## results/figures/peak_status_vs_TE_divergence.pdf: divergence (V5) of TEs
## with vs. without a peak, first for all TEs and then for rows whose class
## (V7) contains "DNA".

feature_files <- list.files(path = "results/data", 
                            pattern = "TEs_peaks.tsv", 
                            full.names = TRUE)

pdf("results/figures/peak_status_vs_TE_divergence.pdf", width = 6, height = 7)

## invisible(): each iteration's value is the ggplot returned by the final
## print(). Left visible, the list would be auto-printed at top level and
## its plots drawn again while the PDF device is still open.
invisible(lapply(seq_along(myo_meta$Abbr), function(i) {
  
  ## Select this species' file by exact file name (the same
  ## "<Abbr>_TEs_peaks.tsv" name the TE/peak counting step reads). A regex
  ## match on the abbreviation alone could pick up zero or several files.
  species_file <- feature_files[
    basename(feature_files) == paste0(myo_meta$Abbr[i], "_TEs_peaks.tsv")
  ]
  if (length(species_file) != 1) {
    stop("Expected exactly one TE/peak file for ", myo_meta$Abbr[i],
         ", found ", length(species_file), call. = FALSE)
  }
  
  ## Load in feature data with intersecting peaks (file contains all features for a given species, irrespective of whether they intersected a peak)
  features_peaks <- fread(species_file, data.table = FALSE)
  
  ## Group by feature, and collapse all peaks associated with that feature
  df <- features_peaks %>%
    dplyr::mutate(ID = paste(V1, V2, V3, V7)) %>% 
    dplyr::group_by(ID) %>%
    dplyr::reframe(
      Peak_ID = paste(unique(V11), collapse = ", "),
      Type = unique(V7),
      Repeat_Name = unique(V6),
      Divergence = unique(V5) 
    )
    
  ## A collapsed peak ID of "." marks a feature with no peak
  df$Peak <- "No peak"
  df$Peak[df$Peak_ID != "."] <- "Peak"
  df$Peak <- factor(df$Peak, levels = c("Peak", "No peak"))
    
  ## All TEs
  print(
    ggplot(df, aes(x = Peak, y = Divergence)) +
      geom_violin(color = "grey", fill = "grey") +
      geom_boxplot(notch = TRUE, width = .3, outlier.shape = NA) +
      theme_minimal() +
      theme(axis.title.x = element_text(margin = margin(t = 10)),
            axis.text.x = element_text(),
            axis.title.y = element_text(margin = margin(r = 15)),
            plot.margin = unit(c(1, 1, 1, 1), "cm")) +
      labs(title = myo_meta$Field_Name[i], 
            subtitle = "Transposable element divergence") +
      xlab("TE status") + ylab("Divergence") +
      scale_y_continuous(labels = comma) 
  )
  
  ## DNA elements only
  print(
    ggplot(subset(df, grepl("DNA", Type)), aes(x = Peak, y = Divergence)) +
      geom_violin(color = "grey", fill = "grey") +
      geom_boxplot(notch = TRUE, width = .3, outlier.shape = NA) +
      theme_minimal() +
      theme(axis.title.x = element_text(margin = margin(t = 10)),
            axis.text.x = element_text(),
            axis.title.y = element_text(margin = margin(r = 15)),
            plot.margin = unit(c(1, 1, 1, 1), "cm")) +
      labs(title = myo_meta$Field_Name[i], 
            subtitle = "DNA transposable element divergence") +
      xlab("TE status") + ylab("Divergence") +
      scale_y_continuous(labels = comma) 
  )
  
}))
       
dev.off()

In [197]:
## Plot proportion of peaks with TEs
##
## Per species: fraction of peaks (V4) that do / do not have a TE hit, drawn
## as stacked bars with species ordered by decreasing TE fraction.

frac_peaks_list <- lapply(seq_along(myo_meta$Abbr), function(i) {
  peaks_path <- file.path(
    "results/data",
    paste0(myo_meta$Abbr[i], "_peaks_TEs.tsv")
  )
  peak_features <- fread(peaks_path, data.table = FALSE)

  ## TE status per peak: V10 of "." marks a row without a TE
  peak_status <- peak_features %>%
    dplyr::mutate(TE = V10 != ".") %>%
    dplyr::group_by(V4) %>%
    dplyr::reframe(TE = unique(TE))

  ## Number of peaks in each TE status
  uniq_peaks <- peak_status %>%
    dplyr::group_by(TE) %>%
    dplyr::reframe(No.Peaks = n())

  data.frame(
    Species = myo_meta$Field_Name[i],
    uniq_peaks,
    row.names = NULL
  )
})

## Stack the species and convert counts to within-species fractions
df <- do.call(rbind, frac_peaks_list)
df <- dplyr::group_by(df, Species)
df <- dplyr::mutate(df, Frac_Peaks = No.Peaks/sum(No.Peaks))

## Species sorted by the fraction of their peaks that carry a TE
x_order <- dplyr::filter(df, TE)
x_order <- dplyr::group_by(x_order, Species)
x_order <- dplyr::arrange(x_order, desc(Frac_Peaks))

df$Species <- factor(df$Species, levels = x_order$Species)

pdf("results/figures/peak_TE_proportion.pdf", width = 8, height = 6)

bar_theme <- theme(
  axis.title.x = element_blank(),
  axis.text.x = element_text(angle = 45, hjust = 1),
  axis.title.y = element_text(margin = margin(r = 15)),
  plot.margin = unit(c(1, 1, 1, 1), "cm")
)

peak_te_plot <- ggplot(df, aes(x = Species, y = Frac_Peaks, fill = TE)) +
  geom_bar(stat = "identity") +
  theme_minimal() +
  bar_theme +
  labs(title = "Proportion of peaks in transposable elements") +
  ylab("Proportion")

print(peak_te_plot)

dev.off()

In [None]:
## Plot proportion of peaks with TEs, with species ordered by the number
## (not the fraction) of peaks that carry a TE

frac_peaks_list <- lapply(seq_along(myo_meta$Abbr), function(i) {
  peak_features <- fread(
    file.path("results/data", paste0(myo_meta$Abbr[i], "_peaks_TEs.tsv")),
    data.table = FALSE
  )
  ## One TE status per peak (V4); V10 of "." marks a row without a TE.
  ## Then count the peaks in each status.
  uniq_peaks <- peak_features %>%
    dplyr::mutate(TE = ifelse(V10 == ".", FALSE, TRUE)) %>%
    dplyr::group_by(V4) %>%
    dplyr::reframe(TE = unique(TE)) %>%
    dplyr::group_by(TE) %>%
    dplyr::reframe(No.Peaks = n()) 
    
  return(
    data.frame(Species = myo_meta$Field_Name[i],
               uniq_peaks, row.names = NULL)
  )
})

## Stack the species and convert counts to within-species fractions
df <- do.call(rbind, frac_peaks_list) %>%
  dplyr::group_by(Species) %>%
  dplyr::mutate(Frac_Peaks = No.Peaks/sum(No.Peaks))
  
## Species sorted by their number of TE-overlapping peaks
x_order <- df %>%
  dplyr::filter(TE == TRUE) %>%
  dplyr::group_by(Species) %>%
  dplyr::arrange(desc(No.Peaks))

## A species with no TE == TRUE row is absent from x_order; append it so it
## is placed last instead of being turned into an NA factor level.
df$Species <- factor(
  df$Species,
  levels = union(x_order$Species, unique(df$Species))
)

pdf("results/figures/peaks_TE_proportion.pdf", width = 8, height = 6)

print(
  ## y must be Frac_Peaks, the column computed above; df has no Frac column
  ggplot(df, aes(x = Species, y = Frac_Peaks, fill = TE)) +
    geom_bar(stat = "identity") +
    theme_minimal() +
    theme(axis.title.x = element_blank(),
          axis.text.x = element_text(angle = 45, hjust = 1),
          axis.title.y = element_text(margin = margin(r = 15)),
          plot.margin = unit(c(1, 1, 1, 1), "cm")) +
    labs(title = "Proportion of peaks in transposable elements") +
    ylab("Proportion")
)

dev.off()

In [None]:
# ## Plot fraction of genome comprised of TEs vs. TEs covered by peaks

# frac_TEs_list <- lapply(seq_along(myo_meta$Abbr), function(i) {

#   ## Load in TE GTF
#   spec_TEs <- fread(file.path("/Users/rebecca/sudmant/analyses/myotis/data/genomes/bat_genomes/repeatMasker", 
#                               paste0(myo_meta$Abbr[i], "1_repetMasker.gff3")), skip = 1, data.table = FALSE)

#   ## Load in chromosome lengths
#   chromsize <- fread(paste0("/Users/rebecca/sudmant/analyses/myotis/data/", 
#                             myo_meta$Abbr[i], "_chromsizes"), 
#                      data.table = FALSE)

#   ## Load in TE peak data
#   features_peaks <- fread(
#     file.path("results/data", paste0(myo_meta$Abbr[i], "_TEs_peaks.tsv")), 
#     data.table = FALSE
#   )
                          
#   ## Get fraction of genome occupied by TEs
#   spec_TEs <- spec_TEs[,c(1, 4:5, 7)] 
#   colnames(spec_TEs) <- c("seqnames", "start", "end", "strand")
#   bps_TEs <- sum(width(GenomicRanges::reduce(
#     makeGRangesFromDataFrame(spec_TEs, ignore.strand = TRUE)
#   )))
#   frac_genome <- bps_TEs/sum(chromsize[,2])

#   ## Get fraction of TEs occupied by peaks:
#   features_peaks <- features_peaks %>%
#     dplyr::group_by(ID = paste(V1, V2, V3)) %>% 
#     dplyr::reframe(
#       Peak_ID = paste(unique(V11), collapse = ", ")
#     ) %>%
#     dplyr::mutate(
#       Peak = ifelse(Peak_ID == ".", FALSE, TRUE)
#     )
#   frac_peaks <- sum(features_peaks$Peak)/nrow(features_peaks)       

#   return(data.frame(Species = myo_meta$Abbr[i], 
#                     Frac_Genome = frac_genome,
#                     Frac_Peaks = frac_peaks))
  
# }) 

# df <- do.call(rbind, frac_TEs_list)

# pdf("results/figures/TE_peak_proportion.pdf")

# print(
#   ggplot(df, aes(x = Frac_Genome, y = Frac_Peaks)) +
#     geom_text(aes(label = Species)) +
#     theme_minimal() +
#     theme(axis.title.x = element_text(margin = margin(t = 15)),
#           axis.text.x = element_text(),
#           axis.title.y = element_text(margin = margin(r = 15)),
#           plot.margin = unit(c(1, 1, 1, 1), "cm")) +
#     labs(title = "Transposable elements") +
#     xlab("Fraction genome with TEs") + ylab("Fraction TEs with peaks") +
#     scale_x_continuous(limits = c(min(df$Frac_Genome - .001), max(df$Frac_Genome + .001)))
# )

# dev.off()