In [1]:
## Work from the species_peaks analysis directory: the loops further down
## locate their inputs through the relative path "results/data"
setwd("/Users/rebecca/sudmant/analyses/myotis/analysis/exploratory/species_peaks")

## dplyr is attached before data.table, so data.table's between(), first()
## and last() mask the dplyr versions for the rest of the session
library(dplyr)
library(data.table)

## Species metadata; the loops below read its Abbr and Field_Name columns
myo_meta <- read.csv("/Users/rebecca/sudmant/analyses/myotis/data/myotis_meta.csv")


Attaching package: 'dplyr'


The following objects are masked from 'package:stats':

    filter, lag


The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union



Attaching package: 'data.table'


The following objects are masked from 'package:dplyr':

    between, first, last




In [2]:
## Load the ortholog mapping table as a plain data.frame (not a data.table)
ortho_table <- fread("/Users/rebecca/sudmant/analyses/myotis/data/ortholog_mapping.csv", data.table = FALSE)

## Replace "SCAF" with "SUPER" in every column whose name starts with "yum".
## NOTE(review): presumably this aligns scaffold naming with the peak files --
## confirm against the upstream annotation.
## The substitution is applied column by column: calling gsub() directly on a
## multi-column data.frame selection would deparse each whole column into a
## single string and corrupt the table.
yum_col <- grep("^yum", colnames(ortho_table))
if (length(yum_col) > 0) {
  ortho_table[yum_col] <- lapply(
    ortho_table[yum_col],
    function(ids) gsub("SCAF", "SUPER", ids, fixed = TRUE)
  )
}

## Standardise the names of the first three (species-independent) columns
colnames(ortho_table)[1:3] <- c("ENSEMBL", "SYMBOL", "Myotis_Alias")

In [3]:
## Intersect gene peaks with orthologous genes for each species
##
## For every row of `myo_meta`: read that species' gene-level peak table from
## results/data, collapse it to one row per gene, attach the ENSEMBL / SYMBOL /
## Myotis_Alias annotation from `ortho_table`, flag whether the gene has a
## peak, and write the result next to the input as "*_orthologs.csv".
## NOTE(review): column 9 is treated as a ";"-separated attribute string and
## V13 as the overlapping peak ID ("." meaning no peak) -- confirm against the
## step that produced the *_peaks.tsv files.

for (i in seq_along(myo_meta$Abbr)) {

  spec_id <- tolower(gsub("Myotis-", "", myo_meta$Field_Name[i]))

  file_path <- list.files(path = "results/data",
                          pattern = paste0(myo_meta$Abbr[i], "_genes_5000.*_peaks.tsv"),
                          full.names = TRUE)
  ## fread() needs exactly one path; fail with a readable message otherwise
  if (length(file_path) != 1L) {
    stop("Expected exactly one gene peak file for ", myo_meta$Abbr[i],
         " but found ", length(file_path), call. = FALSE)
  }

  peaks <- fread(file_path, data.table = FALSE)
  peaks <- peaks[peaks$V3 == "gene", ] ## Subset to gene row for a given feature
  colnames(peaks)[9] <- "Gene"

  ## Parse gene names: keep the first ";"-separated field, then replace
  ## everything up to and including "=" with the "<spec_id>_" prefix.
  ## vapply() keeps the result a character vector even when no rows remain
  ## (sapply() would return an empty list and break the assignment).
  attr_fields <- strsplit(peaks$Gene, ";", fixed = TRUE)
  peaks$Gene <- vapply(attr_fields, "[", character(1), 1)
  peaks$Gene <- gsub(".*=", paste0(spec_id, "_"), peaks$Gene)

  ## One row per gene, with all distinct peak IDs pasted together
  peaks <- peaks %>%
    dplyr::group_by(Gene) %>%
    dplyr::reframe(
      Peak_ID = paste(unique(V13), collapse = ", ")
    )

  ## Annotate peak genes with orthologous genes
  spec_col <- grep(paste0(spec_id, "_ID"), colnames(ortho_table))
  if (length(spec_col) == 0L) {
    stop("No '", spec_id, "_ID' column found in ortho_table", call. = FALSE)
  }
  spec_ortho_table <- ortho_table[, c(1:3, spec_col)]
  colnames(spec_ortho_table)[4] <- "Gene"
  spec_ortho_table <- spec_ortho_table[spec_ortho_table$Gene %in% peaks$Gene, ]
  ## all.x = TRUE keeps genes that have no ortholog entry (annotation is NA)
  peak_orthos <- merge(peaks, spec_ortho_table, by = "Gene", all.x = TRUE, sort = FALSE)

  ## Include peak status in the output: a Peak_ID of exactly "." means no peak
  peak_orthos$Peak <- peak_orthos$Peak_ID != "."
  peak_orthos <- peak_orthos %>% dplyr::select(-Peak_ID)

  ## fixed = TRUE so the "." in ".tsv" is matched literally, not as a wildcard
  fwrite(peak_orthos, file = gsub(".tsv", "_orthologs.csv", file_path, fixed = TRUE))

}

In [7]:
## Intersect exon peaks with orthologous genes for each species
##
## For every row of `myo_meta`: read that species' "genes_exon1" peak table
## from results/data, collapse it to one row per gene, attach the ENSEMBL /
## SYMBOL / Myotis_Alias annotation from `ortho_table`, flag whether the gene
## has a peak, and write the result next to the input as "*_orthologs.csv".
## NOTE(review): column 9 is treated as a ";"-separated attribute string and
## V13 as the overlapping peak ID ("." meaning no peak) -- confirm against the
## step that produced the *_peaks.tsv files.

for (i in seq_along(myo_meta$Abbr)) {

  spec_id <- tolower(gsub("Myotis-", "", myo_meta$Field_Name[i]))

  file_path <- list.files(path = "results/data",
                          pattern = paste0(myo_meta$Abbr[i], "_genes_exon1.*_peaks.tsv"),
                          full.names = TRUE)
  ## fread() needs exactly one path; fail with a readable message otherwise
  if (length(file_path) != 1L) {
    stop("Expected exactly one exon peak file for ", myo_meta$Abbr[i],
         " but found ", length(file_path), call. = FALSE)
  }

  peaks <- fread(file_path, data.table = FALSE)
  colnames(peaks)[9] <- "Gene"

  ## Parse gene names: take the second ";"-separated field, rewrite "model"
  ## to "TU", then replace everything up to and including "=" with the
  ## "<spec_id>_" prefix.
  ## NOTE(review): the model -> TU rewrite presumably maps transcript-model
  ## IDs onto the gene IDs used in ortho_table -- confirm.
  ## vapply() keeps the result a character vector even for an empty table
  ## (sapply() would return an empty list and break the assignment).
  attr_fields <- strsplit(peaks$Gene, ";", fixed = TRUE)
  peaks$Gene <- vapply(attr_fields, "[", character(1), 2)
  peaks$Gene <- gsub("model", "TU", peaks$Gene)
  peaks$Gene <- gsub(".*=", paste0(spec_id, "_"), peaks$Gene)

  ## One row per gene, with all distinct peak IDs pasted together
  peaks <- peaks %>%
    dplyr::group_by(Gene) %>%
    dplyr::reframe(
      Peak_ID = paste(unique(V13), collapse = ", ")
    )

  ## Annotate peak genes with orthologous genes
  spec_col <- grep(paste0(spec_id, "_ID"), colnames(ortho_table))
  if (length(spec_col) == 0L) {
    stop("No '", spec_id, "_ID' column found in ortho_table", call. = FALSE)
  }
  spec_ortho_table <- ortho_table[, c(1:3, spec_col)]
  colnames(spec_ortho_table)[4] <- "Gene"
  spec_ortho_table <- spec_ortho_table[spec_ortho_table$Gene %in% peaks$Gene, ]
  ## all.x = TRUE keeps genes that have no ortholog entry (annotation is NA)
  peak_orthos <- merge(peaks, spec_ortho_table, by = "Gene", all.x = TRUE, sort = FALSE)

  ## Include peak status in the output: a Peak_ID of exactly "." means no peak
  peak_orthos$Peak <- peak_orthos$Peak_ID != "."
  peak_orthos <- peak_orthos %>% dplyr::select(-Peak_ID)

  ## fixed = TRUE so the "." in ".tsv" is matched literally, not as a wildcard
  fwrite(peak_orthos, file = gsub(".tsv", "_orthologs.csv", file_path, fixed = TRUE))

}