In [37]:
# Load necessary libraries
suppressPackageStartupMessages(library(Biostrings))
suppressPackageStartupMessages(library(rtracklayer))
suppressPackageStartupMessages(library(tidyverse))

In [38]:
# Input: FASTA file holding the ORF coding sequences
orf_seq_file <- "../db/orf_coding_all_R64-1-1_20110203.fasta"

# Output directory: create "../tables" only when it is not already there
if (!dir.exists("../tables")) {
  dir.create("../tables")
}

In [39]:
# Load every ORF sequence from the FASTA file as a DNAStringSet
orfseqs <- readDNAStringSet(orf_seq_file)

# Load the annotated RNA-seq / ribosome profiling table from Weinberg 2016.
# The first column has no header in the CSV, so read_csv names it `...1`;
# it is renamed to `id` right after reading.
weinberg2016data <- read_csv(
  "../weinberg2016/GSE53313_Cerevisiae_RNA_RPF_annotated.csv"
)
weinberg2016data <- rename(weinberg2016data, id = `...1`)

[1m[22mNew names:
[36m•[39m `` -> `...1`
[1mRows: [22m[34m4213[39m [1mColumns: [22m[34m6[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m (3): ...1, note, gene
[32mdbl[39m (3): mrna, rpf, te

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.


In [42]:
# Output for the 48 nt primer-design fragments: start the running count of
# written fragments at zero and open the destination file for writing.
genecount <- 0
yeastseqs <- file("../tables/yeastorffrags.csv", open = "w")

In [43]:
# Reduce each FASTA header to its leading run of non-space characters and
# use that shortened token as the sequence name.
genenames <- str_extract(names(orfseqs), "[^\\ ]+")
names(orfseqs) <- genenames

In [44]:
# Walk over every other row of the Weinberg 2016 table and, for each gene whose
# ORF is long enough, write nucleotides 253-300 of the ORF (a 48 nt window) to
# the `yeastseqs` connection, one fragment per line. Stops after 1904
# fragments have been written.
frag_start <- 253
frag_end <- 300
max_frags <- 1904
stop_codons <- c("TAA", "TAG", "TGA")
# First base of every codon inside the fragment (1, 4, ..., 46). Position 253
# is 84 * 3 + 1, so the window is codon-aligned relative to base 1 of the ORF.
codon_starts <- seq(1, frag_end - frag_start + 1, by = 3)

# Odd row indices; unlike seq(1, n, by = 2) this is simply empty for n == 0.
for (i in seq_len(nrow(weinberg2016data))[c(TRUE, FALSE)]) {
  gene <- weinberg2016data$id[i]

  # Fail with a readable message if the table references an ORF that is
  # absent from the FASTA.
  if (!gene %in% names(orfseqs)) {
    stop("No ORF sequence found for gene: ", gene)
  }

  orf <- orfseqs[gene]
  # ORFs shorter than the window end cannot supply a full fragment
  if (width(orf) < frag_end) next

  frag <- as.character(subseq(orf, start = frag_start, end = frag_end))

  # Check for stop codons: split the fragment into its codons in one call
  codons <- substring(frag, codon_starts, codon_starts + 2)
  if (any(codons %in% stop_codons)) {
    stop("Stop codon in fragment. Should not be here.")
  }

  # writeLines() terminates the fragment with a newline; cat(..., sep = "\n")
  # on a single string emits no terminator, which ran all fragments together.
  writeLines(frag, con = yeastseqs)

  genecount <- genecount + 1
  if (genecount >= max_frags) break
}

In [45]:
# Close the output connection so the written fragments are flushed to disk
close(yeastseqs)