In [2]:
# Work from the per-individual peak analysis directory: the relative paths used
# later in this notebook ("results/data", "results/figures") resolve against it.
# NOTE(review): absolute, machine-specific path -- adjust when running elsewhere.
setwd("/Users/rebecca/sudmant/analyses/myotis/analysis/exploratory/species_individual_peaks")

library(dplyr)
library(scales)
library(ggplot2)
library(data.table)

# Species metadata table; the Abbr and Field_Name columns are read below.
myo_meta <- read.csv("/Users/rebecca/sudmant/analyses/myotis/data/myotis_meta.csv")

## Get differential peak status between species in orthologous genes present in ALL species


Attaching package: 'dplyr'


The following objects are masked from 'package:stats':

    filter, lag


The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union



Attaching package: 'data.table'


The following objects are masked from 'package:dplyr':

    between, first, last




In [24]:
## Get # of genes with peaks in all species:

# Per-individual ortholog/peak tables. File names are assumed to follow
# "<species abbr>_<individual>_genes_5000...orthologs..." (the second
# "_"-separated field is used as the individual ID below).
peak_ortho_files <- list.files(path = "results/data",
                               pattern = "genes_5000.*orthologs",
                               full.names = TRUE)

# One tibble per species with columns Myotis_Alias, Peak, Species, Individual,
# holding one row per (individual, gene).
peak_gene_list <- lapply(seq_along(myo_meta$Abbr), function(i) {
  # Literal (non-regex) match of the species abbreviation against the paths.
  file_paths <- grep(myo_meta$Abbr[i], peak_ortho_files,
                     fixed = TRUE, value = TRUE)
  indv_list <- lapply(file_paths, function(file_path) {
    # Parse the ID from the file name only, so directory names containing "_"
    # cannot shift the field position.
    indv_id <- strsplit(basename(file_path), "_", fixed = TRUE)[[1]][2]
    fread(file_path, data.table = FALSE) %>%
      dplyr::filter(Myotis_Alias != "") %>%
      # A gene may span several rows; it counts as having a peak when any of
      # its rows is flagged. Keeping Peak logical (instead of pasting values
      # into a string) gives a stable two-level column for filtering/plotting.
      dplyr::summarise(
        Peak = any(as.logical(Peak), na.rm = TRUE),
        .by = Myotis_Alias
      ) %>%
      dplyr::mutate(
        Species = myo_meta$Field_Name[i],
        Individual = indv_id
      )
  })
  dplyr::bind_rows(indv_list)
})
df <- dplyr::bind_rows(peak_gene_list)

# Number of genes with / without a peak for every individual.
df <- df %>%
  dplyr::count(Species, Individual, Peak, name = "No.Genes")

# Order species facets by their total number of genes with a peak.
x_order <- df %>%
  dplyr::filter(Peak) %>%
  dplyr::summarise(n = sum(No.Genes), .by = Species) %>%
  dplyr::arrange(desc(n))

# Species without any peak gene are absent from x_order; append them so they
# are not turned into NA factor levels and dropped from the plot.
facet_levels <- union(x_order$Species, unique(df$Species))
df$Species <- factor(df$Species, levels = facet_levels)

ggplot(df, aes(x = Individual, y = No.Genes, fill = Peak)) +
  geom_col() +
  theme_minimal() +
  theme(axis.title.x = element_blank(),
        axis.text.x = element_text(angle = 45, hjust = 1),
        axis.title.y = element_text(margin = margin(r = 15)),
        panel.grid = element_blank(),
        plot.margin = unit(c(1, 1, 1, 1), "cm")) +
  labs(title = "Orthologous genes") +
  xlab("Individual") + ylab("# genes") +
  scale_y_continuous(labels = comma) +
  facet_wrap(. ~ Species, scales = "free_x")

In [28]:
## Get genes shared between all species:

# Collect the distinct, non-empty gene aliases found in each individual's file.
gene_list <- lapply(peak_ortho_files, function(path) {
  aliases <- fread(path, data.table = FALSE)$Myotis_Alias
  unique(aliases[aliases != ""])
})

# Keep only the aliases that occur in every file, then report how many remain.
shared_genes <- Reduce(intersect, gene_list)
length(shared_genes)

In [37]:
## Get peak status of shared genes:

# For each individual's file, list the shared genes that have a peak, i.e. the
# genes whose Peak values sum to more than zero across their rows.
gene_peak_list <- lapply(peak_ortho_files, function(path) {
  fread(path, data.table = FALSE) %>%
    dplyr::filter(Myotis_Alias %in% shared_genes) %>%
    dplyr::summarise(Peak = sum(Peak) > 0, .by = Myotis_Alias) %>%
    # filter() also drops genes whose status is NA.
    dplyr::filter(Peak) %>%
    dplyr::pull(Myotis_Alias)
})

# Label each element "<first field> <second field>" of the "_"-separated file
# name. basename() keeps this independent of the directory depth.
names(gene_peak_list) <- vapply(
  strsplit(basename(peak_ortho_files), "_", fixed = TRUE),
  function(parts) paste(parts[1:2], collapse = " "),
  character(1)
)

# Gene x individual indicator table (1 = gene listed for that individual).
# NOTE: genes with no peak in any individual never enter gene_peak_list, so
# they have no row here; nrow(spec_peak_status) can therefore be smaller than
# length(shared_genes).
spec_peak_status <- as.data.frame.matrix(table(stack(gene_peak_list)))

In [38]:
# Preview the first rows of the gene x individual peak indicator table.
head(spec_peak_status)

Unnamed: 0_level_0,mMyoAui B2S1-17,mMyoAui B2S1-19,mMyoAui B2S1-20,mMyoCai B5S4-6,mMyoCai MyoCal2,mMyoCai W8-MyCa,mMyoEvo B1S1-2,mMyoEvo B1S1-3,mMyoEvo B1S5-15,mMyoLuc B4S3-7,...,mMyoThy B2S1-16,mMyoVel B2S1-4,mMyoVel B2S1-5,mMyoVel B3S1-13,mMyoVol B1S3-11,mMyoVol B2S1-21,mMyoVol B2S1-22,mMyoYum B3S1-19,mMyoYum MY2-SW,mMyoYum W10-MyYu
Unnamed: 0_level_1,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,...,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>
gene_100,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
gene_10002,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
gene_10003,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
gene_10004,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
gene_10006,1,1,1,1,1,1,1,1,1,1,...,0,1,1,1,1,1,1,1,1,1
gene_10007,1,1,1,1,0,1,1,1,1,1,...,0,1,1,1,1,1,1,1,1,1


In [69]:
## Cluster species by their peak status:

library(RColorBrewer)
library(ComplexHeatmap)

# Pairwise correlation between the columns (individuals) of the peak table.
cor_mat <- cor(spec_peak_status)
# Mask the self-correlation entries on the diagonal.
diag(cor_mat) <- NA

n_genes <- nrow(spec_peak_status)

plot_title <- paste("Peak status over", comma(n_genes), "shared orthologous genes")

# Column names look like "<species> <individual>"; the part before the first
# space is used as the species label for the annotations.
meta_df <- data.frame(
  Species = vapply(strsplit(colnames(cor_mat), " ", fixed = TRUE),
                   function(parts) parts[1], character(1)),
  row.names = colnames(cor_mat)
)

# One colour per species. brewer.pal() only supports 3-12 colours for the
# "Paired" palette, so request a valid size and then trim (fewer than 3
# species) or interpolate (more than 12) to get exactly one colour each.
species_levels <- unique(meta_df$Species)
n_species <- length(species_levels)
colors <- brewer.pal(min(max(n_species, 3L), 12L), "Paired")
if (n_species > length(colors)) {
  colors <- grDevices::colorRampPalette(colors)(n_species)
}
colors <- colors[seq_len(n_species)]
names(colors) <- species_levels
meta_cols <- list(Species = colors)

col_anno <- HeatmapAnnotation(df = meta_df, col = meta_cols)  # Column annotation
row_anno <- HeatmapAnnotation(df = meta_df, col = meta_cols, which = "row",
                              show_legend = FALSE,
                              show_annotation_name = FALSE)  # Row annotation

In [79]:
# Write the clustered correlation heatmap to a PDF file.
pdf(file = "results/figures/gene_peak_status_correlation_heatmap_v2.pdf", width = 12, height = 9)

# Close the device in `finally` so a failure while building or drawing the
# heatmap does not leave the PDF device open. invisible() keeps the returned
# heatmap object from being auto-printed (redrawn) after the device is closed.
invisible(tryCatch(
  print(
    ComplexHeatmap::Heatmap(name = "Peak status correlation",
                            column_title = plot_title,
                            matrix = cor_mat,
                            clustering_method_rows = "average",
                            clustering_method_columns = "average",
                            top_annotation = col_anno,
                            left_annotation = row_anno)
  ),
  finally = dev.off()
))