## Get all variants of each DMS motif

### Load libraries

In [1]:
suppressPackageStartupMessages(library(R.utils))
suppressPackageStartupMessages(library(Biostrings))
suppressPackageStartupMessages(library(plyranges))
suppressPackageStartupMessages(library(tidyverse))
suppressPackageStartupMessages(library(rasilabRtemplates))

In [2]:
# Spike-in insert sequences (48 nt each). They are appended to the variant
# table further down under the ids "spikein2" and "spikein5".
spikein2 <- "GTTGGCGTTGGATGTCTAAGGGTGAAGAATTGTTCACACCCACAAAAA"
spikein5 <- "AAGAAGAAGAAGAAGAAATGTCTAAGGGTGAAGAATTGTTCCACCCGC"

### Get command line arguments

In [3]:
# Read the key/value command line arguments into a named list.
args <- R.utils::commandArgs(trailingOnly = TRUE, asValues = TRUE)
# Input CSV describing the DMS oligos.
dms_variants_file <- args[["dms_variants_file"]]
# Destination CSV for the expanded variant annotations.
output_file <- args[["output_file"]]

# Uncomment the two lines below to run interactively without arguments.
# dms_variants_file <- '../annotations/fk_dms_opool.csv'
# output_file <- "../annotations/insert_annotations_2.csv"

# `[[` returns NULL for an absent argument; stop here with a clear message
# rather than letting the later read/write calls fail obscurely.
if (is.null(dms_variants_file) || is.null(output_file)) {
  stop(
    "Both 'dms_variants_file' and 'output_file' must be supplied.",
    call. = FALSE
  )
}

### Load DMS oligo sequences 

In [15]:
# Read the oligo table and reduce it to the columns used downstream:
# oligo id, wild-type motif, mutated position and the insert sequence.
dms_variants <- read_csv(dms_variants_file, show_col_types = FALSE) %>%
  # get rid of flanking sequences: keep `nt` from position 19 up to
  # 24 characters before its end (drops the first 18 and the last 24)
  mutate(insert_seq = substr(nt, 19, nchar(nt) - 24)) %>%
  rename(id = name, wt_motif = aa) %>%
  select(id, wt_motif, mutstart, insert_seq)

# Display the table in the notebook output.
dms_variants

id,wt_motif,mutstart,insert_seq
<chr>,<chr>,<dbl>,<chr>
fk8,FKFKFKFKFKFKFKFK,1,NNNAAGTTTAAGTTCAAGTTCAAATTCAAGTTTAAGTTCAAGTTCAAA
fk8,FKFKFKFKFKFKFKFK,4,TTCNNNTTTAAGTTCAAGTTCAAATTCAAGTTTAAGTTCAAGTTCAAA
fk8,FKFKFKFKFKFKFKFK,7,TTCAAGNNNAAGTTCAAGTTCAAATTCAAGTTTAAGTTCAAGTTCAAA
fk8,FKFKFKFKFKFKFKFK,10,TTCAAGTTTNNNTTCAAGTTCAAATTCAAGTTTAAGTTCAAGTTCAAA
fk8,FKFKFKFKFKFKFKFK,13,TTCAAGTTTAAGNNNAAGTTCAAATTCAAGTTTAAGTTCAAGTTCAAA
fk8,FKFKFKFKFKFKFKFK,16,TTCAAGTTTAAGTTCNNNTTCAAATTCAAGTTTAAGTTCAAGTTCAAA
fk8,FKFKFKFKFKFKFKFK,19,TTCAAGTTTAAGTTCAAGNNNAAATTCAAGTTTAAGTTCAAGTTCAAA
fk8,FKFKFKFKFKFKFKFK,22,TTCAAGTTTAAGTTCAAGTTCNNNTTCAAGTTTAAGTTCAAGTTCAAA
fk8,FKFKFKFKFKFKFKFK,25,TTCAAGTTTAAGTTCAAGTTCAAANNNAAGTTTAAGTTCAAGTTCAAA
fk8,FKFKFKFKFKFKFKFK,28,TTCAAGTTTAAGTTCAAGTTCAAATTCNNNTTTAAGTTCAAGTTCAAA


### Get NNN codons

In [16]:
# One-column tibble (`codon`) holding every codon named in Biostrings'
# GENETIC_CODE table; these are the substitutions tried at each NNN site.
NNN_codons <- tibble(codon = names(GENETIC_CODE))

### Parse NNN oligos into individual variants

In [17]:
# Expand every oligo that carries an NNN placeholder into one row per codon,
# translate each resulting insert, then append the two spike-in sequences.
# NOTE(review): substituting the wild-type codon yields the same wild-type
# insert_seq for every mutstart, so that sequence appears on several rows;
# confirm downstream steps expect duplicated insert sequences.
dms_variants <- dms_variants %>%
  filter(str_detect(insert_seq, "NNN")) %>%
  # give every row the full codon vector as a list-column, then unnest it
  # so each oligo is paired with each codon
  mutate(codon = list(NNN_codons$codon)) %>%
  unnest(codon) %>%
  # str_replace() substitutes only the first NNN match in each sequence
  mutate(insert_seq = str_replace(insert_seq, "NNN", codon)) %>%
  mutate(
    motif = as.character(
      translate(DNAStringSet(insert_seq), no.init.codon = TRUE)
    )
  ) %>%
  # spike-ins carry no mutation, so mutstart, codon and motif are left NA
  add_row(
    id = "spikein2",
    wt_motif = as.character(
      translate(DNAStringSet(spikein2), no.init.codon = TRUE)
    ),
    mutstart = NA,
    insert_seq = spikein2,
    codon = NA,
    motif = NA
  ) %>%
  add_row(
    id = "spikein5",
    wt_motif = as.character(
      translate(DNAStringSet(spikein5), no.init.codon = TRUE)
    ),
    mutstart = NA,
    insert_seq = spikein5,
    codon = NA,
    motif = NA
  )

### Write variant annotations to CSV file

In [18]:
# Number the inserts 1..n in their current row order, move the number and
# sequence columns to the front, and write the table to `output_file` as CSV.
# write_csv() returns its input invisibly, so `variant_annotations` keeps the
# written table.
variant_annotations <- dms_variants %>%
  # row_number() is safe for an empty table, unlike seq(1, n())
  mutate(insert_num = row_number()) %>%
  select(insert_num, insert_seq, everything()) %>%
  write_csv(output_file)

insert_num,insert_seq,id,wt_motif,mutstart,codon,motif
<int>,<chr>,<chr>,<chr>,<dbl>,<chr>,<chr>
1,TTTAAGTTTAAGTTCAAGTTCAAATTCAAGTTTAAGTTCAAGTTCAAA,fk8,FKFKFKFKFKFKFKFK,1,TTT,FKFKFKFKFKFKFKFK
2,TTCAAGTTTAAGTTCAAGTTCAAATTCAAGTTTAAGTTCAAGTTCAAA,fk8,FKFKFKFKFKFKFKFK,1,TTC,FKFKFKFKFKFKFKFK
3,TTAAAGTTTAAGTTCAAGTTCAAATTCAAGTTTAAGTTCAAGTTCAAA,fk8,FKFKFKFKFKFKFKFK,1,TTA,LKFKFKFKFKFKFKFK
4,TTGAAGTTTAAGTTCAAGTTCAAATTCAAGTTTAAGTTCAAGTTCAAA,fk8,FKFKFKFKFKFKFKFK,1,TTG,LKFKFKFKFKFKFKFK
5,TCTAAGTTTAAGTTCAAGTTCAAATTCAAGTTTAAGTTCAAGTTCAAA,fk8,FKFKFKFKFKFKFKFK,1,TCT,SKFKFKFKFKFKFKFK
6,TCCAAGTTTAAGTTCAAGTTCAAATTCAAGTTTAAGTTCAAGTTCAAA,fk8,FKFKFKFKFKFKFKFK,1,TCC,SKFKFKFKFKFKFKFK
7,TCAAAGTTTAAGTTCAAGTTCAAATTCAAGTTTAAGTTCAAGTTCAAA,fk8,FKFKFKFKFKFKFKFK,1,TCA,SKFKFKFKFKFKFKFK
8,TCGAAGTTTAAGTTCAAGTTCAAATTCAAGTTTAAGTTCAAGTTCAAA,fk8,FKFKFKFKFKFKFKFK,1,TCG,SKFKFKFKFKFKFKFK
9,TATAAGTTTAAGTTCAAGTTCAAATTCAAGTTTAAGTTCAAGTTCAAA,fk8,FKFKFKFKFKFKFKFK,1,TAT,YKFKFKFKFKFKFKFK
10,TACAAGTTTAAGTTCAAGTTCAAATTCAAGTTTAAGTTCAAGTTCAAA,fk8,FKFKFKFKFKFKFKFK,1,TAC,YKFKFKFKFKFKFKFK
