In [12]:
## Relative paths used later in this script (e.g. "results/data") are resolved
## against this working directory
setwd("/Users/rebecca/sudmant/analyses/myotis/analysis/exploratory/species_individual_peaks")

library(dplyr) ## group_by / reframe / mutate / select
library(data.table) ## fread / fwrite

## Species metadata; the Abbr and Field_Name columns are read further down
myo_meta <- read.csv("/Users/rebecca/sudmant/analyses/myotis/data/myotis_meta.csv")

In [13]:
## Ortholog mapping table, read as a plain data.frame
ortho_table <- fread("/Users/rebecca/sudmant/analyses/myotis/data/ortholog_mapping.csv", data.table = FALSE)

## Replace "SCAF" with "SUPER" in every column whose name starts with "yum".
## Each column is rewritten on its own: calling gsub() on a multi-column
## data.frame slice would deparse whole columns into single strings. The loop
## is a no-op when no column matches.
yum_col <- grep("^yum", colnames(ortho_table))
for (col in yum_col) {
  ortho_table[[col]] <- gsub("SCAF", "SUPER", ortho_table[[col]])
}

## The first three columns are referred to by these names from here on
colnames(ortho_table)[1:3] <- c("ENSEMBL", "SYMBOL", "Myotis_Alias")

In [14]:
## Intersect gene peaks with orthologous genes for each species
##
## For each species in `myo_meta`, every matching peak file in results/data is
## collapsed to one row per gene, joined to that species' ortholog IDs from
## `ortho_table`, and written next to the input with the ".tsv" suffix replaced
## by "_orthologs.csv".

for (i in seq_along(myo_meta$Abbr)) {

  spec_id <- tolower(gsub("Myotis-", "", myo_meta$Field_Name[i]))

  ## Escape the dot and anchor the suffix so only files that really end in
  ## "_peaks.tsv" are picked up
  file_paths <- list.files(path = "results/data",
                           pattern = paste0(myo_meta$Abbr[i], ".*_genes_5000.*_peaks\\.tsv$"),
                           full.names = TRUE)
  if (length(file_paths) == 0) {
    next
  }

  ## Species ortholog table. Built once per species: nothing here depends on
  ## the individual peak file being processed.
  spec_col <- grep(paste0(spec_id, "_ID"), colnames(ortho_table))
  if (length(spec_col) == 0) {
    stop("No '", spec_id, "_ID' column found in ortho_table", call. = FALSE)
  }
  spec_ortho_table <- ortho_table[, c(1:3, spec_col), drop = FALSE]
  colnames(spec_ortho_table)[4] <- "Gene"
  ## Remove each row's ENSEMBL ID (with a trailing "1" swapped for "0") and any
  ## dangling "-" from the species gene ID. The ENSEMBL value is used as a
  ## regular expression, so this stays row-wise.
  spec_ortho_table$Gene <- vapply(seq_len(nrow(spec_ortho_table)), function(k) {
    ensmbl <- gsub("1$", "0", spec_ortho_table$ENSEMBL[k])
    gsub("-$", "", gsub(ensmbl, "", spec_ortho_table$Gene[k]))
  }, character(1))

  for (peak_file in file_paths) {

    peaks <- fread(peak_file, data.table = FALSE)
    peaks <- peaks[peaks$V3 %in% c("gene", "mRNA"), ] ## Subset to gene row for a given feature

    ## Parse gene names: take the first ";"-separated attribute and replace
    ## everything up to "=" with the species prefix.
    ## vapply() keeps the result a character vector even when there are no rows.
    attrs <- strsplit(peaks$V9, ";", fixed = TRUE)
    peaks$Gene <- vapply(attrs, "[", character(1), 1)
    peaks$Gene <- gsub(".*=", paste0(spec_id, "_"), peaks$Gene)

    ## Features with ENST IDs need to be parsed differently: the name is built
    ## as "<Name>-<Parent>" from the third and second attributes
    is_enst <- grepl("ENST", peaks$V9)
    peaks$Gene[is_enst] <- vapply(attrs[is_enst], function(x) {
      gsub(" ", "-", paste(gsub("Name=", "", x[3]), gsub("Parent=", "", x[2])))
    }, character(1))

    ## One row per gene, with all distinct peak IDs collapsed into one string
    peaks <- peaks %>%
      dplyr::group_by(Gene) %>%
      dplyr::reframe(
        Peak_ID = paste(unique(V15), collapse = ", ")
      )

    ## Annotate peak genes with orthologous genes (genes without an ortholog
    ## are kept, with NA annotation columns)
    matched_orthos <- spec_ortho_table[spec_ortho_table$Gene %in% peaks$Gene, ]
    peak_orthos <- merge(peaks, matched_orthos, by = "Gene", all.x = TRUE, sort = FALSE)

    ## Include peak status in the output: Peak is FALSE only when the collapsed
    ## Peak_ID is exactly "."
    peak_orthos <- peak_orthos %>%
      dplyr::mutate(Peak = Peak_ID != ".") %>%
      dplyr::select(-Peak_ID)

    ## Swap only the trailing ".tsv" extension for the output name
    fwrite(peak_orthos, file = sub("\\.tsv$", "_orthologs.csv", peak_file))

  }

}

In [7]:
# ## (Disabled: this whole cell is commented out) Intersect exon peaks with orthologous genes for each species

# for (i in seq_along(myo_meta$Abbr)) {
  
#   spec_id <- tolower(gsub("Myotis-", "", myo_meta$Field_Name[i]))
  
#   file_path <- list.files(path = "results/data", 
#                           pattern = paste0(myo_meta$Abbr[i], "_genes_exon1.*_peaks.tsv"),
#                           full.names = TRUE)
#   peaks <- fread(file_path, data.table = FALSE)
#   colnames(peaks)[15] <- "Gene"
#   ## Parse gene names
#   peaks$Gene <- sapply(strsplit(peaks$Gene, ";"), "[", 2)
#   peaks$Gene <- gsub("model", "TU", peaks$Gene)
#   peaks$Gene <- gsub(".*=", paste0(spec_id, "_"), peaks$Gene)
#   peaks <- peaks %>%
#     dplyr::group_by(Gene) %>%
#     dplyr::reframe(
#       Peak_ID = paste(unique(V6), collapse = ", ")
#     )
    
#   ## Annotate peak genes with orthologous genes
#   spec_col <- grep(paste0(spec_id, "_ID"), colnames(ortho_table))
#   spec_ortho_table <- ortho_table[,c(1:3, spec_col),]
#   colnames(spec_ortho_table)[4] <- "Gene"
#   spec_ortho_table <- spec_ortho_table[spec_ortho_table$Gene %in% peaks$Gene,]
#   peak_orthos <- merge(peaks, spec_ortho_table, by = "Gene", all.x = TRUE, sort = FALSE)
  
#   ## Include peak status in the output
#   peak_orthos <- peak_orthos %>%
#     dplyr::mutate(
#       Peak = ifelse(Peak_ID == ".", FALSE, TRUE)
#     ) %>%
#     dplyr::select(-Peak_ID)
  
#   fwrite(peak_orthos, file = gsub(".tsv", "_orthologs.csv", file_path))
  
# }