In [None]:
# Load necessary libraries
suppressPackageStartupMessages(library(Biostrings))
suppressPackageStartupMessages(library(rtracklayer))
suppressPackageStartupMessages(library(tidyverse))

In [None]:
# Work from the scripts directory so that the relative '../' paths used in the
# following cells resolve against it.
# NOTE(review): this is an absolute, machine-specific path; adjust it (or drop
# the call and launch the notebook from the scripts directory) before running
# on another system.
setwd('/fh/fast/subramaniam_a/user/kchen/git/chen_2023/analysis/library_design/endogenous_fragments/scripts')

In [None]:
# Input files, given relative to the working directory
orf_seq_file <- '../db/orf_coding_all_R64-1-1_20110203.fasta'
annotations <- '../db/saccharomyces_cerevisiae_R64-1-1_20110208.gff'

# Make sure the output directory is present before anything is written to it
tables_dir_present <- dir.exists("../tables")
if (!tables_dir_present) {
  dir.create("../tables")
}

In [None]:
# Load the ORF coding sequences from the FASTA file
orfseqs <- readDNAStringSet(filepath = orf_seq_file)

# Load the Weinberg 2016 RNA-seq / ribosome profiling table; the first three
# lines of the file are skipped and the column names are supplied here
weinberg2016data <- read_tsv(
  file = '../weinberg2016/GSE53313_Cerevisiae_RNA_RPF.txt',
  col_names = c('id', 'mrna', 'rpf', 'te'),
  skip = 3
)

In [None]:
# Parse the GFF annotation file
annotations_data <- import(annotations)

# Build a lookup table (name, note, gene) of the 'gene' features whose ORF
# classification contains 'Verified'
genes <- as_tibble(annotations_data) %>%
  dplyr::filter(
    type == 'gene',
    str_detect(orf_classification, 'Verified')
  ) %>%
  # Note is carried through as imported; the decoding step is left disabled:
  # mutate(note = URLdecode(URLdecode(Note)))
  # Fall back to the Name attribute when no gene value is given
  mutate(gene = ifelse(is.na(gene), Name, gene)) %>%
  select(name = Name, note = Note, gene)

In [None]:
# Attach the gene annotations to the expression table (matching id to name)
# and order the rows from highest to lowest rpf
weinberg2016data <- weinberg2016data %>%
  left_join(genes, by = c("id" = "name")) %>%
  arrange(desc(rpf))

In [None]:
# Open the output file for the 48 nt fragments (used for primer design) in
# write mode, and start the count of fragments written so far at zero
genecount <- 0
yeastseqs <- file(description = '../tables/yeastorffrags.csv', open = 'w')

In [None]:
# Keep only the leading run of non-space characters of each FASTA header
genenames <- str_extract(names(orfseqs), '[^\\ ]+')

In [None]:
# Replace the full FASTA header of each sequence with its shortened name so
# that sequences can be looked up by that name
names(orfseqs) <- genenames

In [None]:
# Display the renamed sequence set for inspection
orfseqs

In [None]:
# Write one 48 nt fragment (ORF positions 253-300, which is in frame because
# 253 = 3 * 84 + 1) per selected gene to the open `yeastseqs` connection, one
# fragment per line, until `max_genes` fragments have been written.
frag_start <- 253
frag_end <- 300
max_genes <- 1904
stop_codons <- c('TAA', 'TAG', 'TGA')

# Start position of every codon within a fragment (1, 4, ..., 46)
codon_starts <- seq(1, frag_end - frag_start + 1, by = 3)

# Visit rows 1, 3, 5, ... of the expression table. Subsetting seq_len() keeps
# this safe for an empty table, where seq(1, 0, by = 2) would raise an error.
# NOTE(review): only every other row is used -- presumably intentional; confirm.
row_idx <- seq_len(nrow(weinberg2016data))[c(TRUE, FALSE)]

for (i in row_idx) {
  gene <- weinberg2016data$id[i]

  # Skip ORFs that are too short to contain the fragment
  if (width(orfseqs[gene]) < frag_end) next

  frag <- as.character(
    subseq(orfseqs[gene], start = frag_start, end = frag_end)
  )

  # Sanity check: no in-frame stop codon may occur inside the fragment
  codons <- substring(frag, codon_starts, codon_starts + 2)
  if (any(codons %in% stop_codons)) {
    stop('Stop codon in fragment. Should not be here.')
  }

  # writeLines() terminates every fragment with a newline. cat(x, sep = "\n")
  # only puts the separator *between* elements, so a single string written per
  # call got no newline and successive fragments ran together on one line.
  writeLines(frag, con = yeastseqs)

  genecount <- genecount + 1
  if (genecount >= max_genes) break
}

In [None]:
# Close the connection to the fragment output file now that writing is done
close(yeastseqs)