In [74]:
## Run from the species_peaks analysis directory: the relative "results/data"
## paths used later in this notebook resolve against this working directory.
setwd("/Users/rebecca/sudmant/analyses/myotis/analysis/exploratory/species_peaks")

## dplyr supplies %>% and the grouped verbs; data.table supplies fread().
library(dplyr)
library(data.table)

## Species metadata table (presumably one row per species -- confirm).
## The Abbr and Field_Name columns are used below.
myo_meta <- read.csv("/Users/rebecca/sudmant/analyses/myotis/data/myotis_meta.csv")

In [13]:
## Get differential peak status between species in orthologous genes present in ALL species

peak_ortho_files <- list.files(path = "results/data",
                               pattern = "genes_5000.*orthologs",
                               full.names = TRUE)

## Read each species' peak/ortholog table exactly once and reuse it for both
## steps below (the tables were previously parsed twice per species).
## Each abbreviation in myo_meta$Abbr must identify exactly one file; fail
## early with a clear message instead of handing fread() a vector of length
## zero or greater than one.
peak_ortho_list <- lapply(as.character(myo_meta$Abbr), function(abbr) {
  ortho_file <- grep(abbr, peak_ortho_files, value = TRUE)
  if (length(ortho_file) != 1) {
    stop("Expected exactly one peak/ortholog file matching '", abbr,
         "' but found ", length(ortho_file), call. = FALSE)
  }
  fread(ortho_file, data.table = FALSE)
})

## Get list of genes present in all species:
## (non-empty SYMBOL values per species, intersected across every species)

gene_list <- lapply(peak_ortho_list, function(peak_ortho) {
  unique(peak_ortho$SYMBOL[peak_ortho$SYMBOL != ""])
})
shared_genes <- Reduce(intersect, gene_list)

## Get peak status of shared genes:
## For each species, keep the shared genes whose Peak column sums to more
## than 1 across that gene's rows.
## NOTE(review): `> 1` means a gene needs a Peak sum of at least 2 to count;
## if a single peak should be enough this threshold should be `> 0` -- confirm.

gene_peak_list <- lapply(peak_ortho_list, function(peak_ortho) {
  peak_status <- peak_ortho %>%
    dplyr::filter(SYMBOL %in% shared_genes) %>%
    dplyr::group_by(SYMBOL) %>%
    dplyr::summarise(Peak = sum(Peak) > 1, .groups = "drop")
  ## which() drops genes whose status is NA rather than emitting an NA symbol.
  peak_status$SYMBOL[which(peak_status$Peak)]
})
names(gene_peak_list) <- myo_meta$Field_Name

## Gene x species table: one row per gene that passed the peak threshold in
## at least one species, one column per species (Field_Name). Shared genes
## that passed in no species do not appear as rows.
spec_peak_status <- as.data.frame.matrix(table(stack(gene_peak_list)))

In [75]:
## Classify genes by how many species have a peak, and write one CSV per class.
## Fixes the stray closing parenthesis that followed each write.csv() call and
## made this cell a syntax error.

## Work on a plain matrix so rows can be looked up by exact gene name, and
## count the species with a peak once instead of once per class.
peak_mat <- as.matrix(spec_peak_status)
n_spec <- ncol(peak_mat)
n_spec_with_peak <- rowSums(peak_mat)

## What genes have peaks in ALL species?

shared_peaks <- rownames(peak_mat)[n_spec_with_peak == n_spec]
shared_peaks_df <- data.frame(Gene = shared_peaks)
write.csv(shared_peaks_df, file = "results/data/shared_genes_shared_peaks.csv")

## What genes have peaks in N-1 species?

semi_shared_peaks <- rownames(peak_mat)[n_spec_with_peak == (n_spec - 1)]

## Which species is missing a peak in these genes?
## vapply() enforces exactly one species per gene and returns character(0)
## (not NULL) when no gene falls in this class.
semi_shared_spec <- vapply(semi_shared_peaks, function(gene) {
  colnames(peak_mat)[peak_mat[gene, ] == 0]
}, character(1))
names(semi_shared_spec) <- semi_shared_peaks
semi_shared_df <- data.frame(Gene = semi_shared_peaks,
                             Species = semi_shared_spec,
                             row.names = NULL)
write.csv(semi_shared_df, file = "results/data/shared_genes_semi_shared_peaks.csv")

## What genes have private peaks (i.e. peaks in just one species)?

private_peaks <- rownames(peak_mat)[n_spec_with_peak == 1]

## What species is associated with each private gene?
private_spec <- vapply(private_peaks, function(gene) {
  colnames(peak_mat)[peak_mat[gene, ] != 0]
}, character(1))
## Keep the gene names on the vector: later code builds a data frame from it
## and relies on them for row names.
names(private_spec) <- private_peaks
private_df <- data.frame(Gene = private_peaks,
                         Species = private_spec,
                         row.names = NULL)
write.csv(private_df, file = "results/data/shared_genes_private_peaks.csv")


In [82]:
## Cluster species by their peak status:
## Rebuild the private-peak table, this time without `row.names = NULL`, so
## the gene names carried by `private_spec` become the row names.
private_df <- data.frame(
  Gene = private_peaks,
  Species = private_spec
)

## Preview the first six rows.
head(private_df, n = 6L)

Unnamed: 0_level_0,Gene,Species
Unnamed: 0_level_1,<chr>,<chr>
ABCA13,ABCA13,Myotis-Velifer
ADRM1,ADRM1,Myotis-Volans
ALDOA,ALDOA,Myotis-Velifer
ANKRD17,ANKRD17,Myotis-Volans
ARFGEF1,ARFGEF1,Myotis-Californicus
ARG2,ARG2,Myotis-Thysanodes
