## Get all variants of each DMS motif

### Load libraries

In [1]:
suppressPackageStartupMessages(library(R.utils))
suppressPackageStartupMessages(library(Biostrings))
suppressPackageStartupMessages(library(plyranges))
suppressPackageStartupMessages(library(tidyverse))
suppressPackageStartupMessages(library(rasilabRtemplates))

### Get command line arguments

In [2]:
args <- R.utils::commandArgs(trailingOnly = T, asValues = T)
dms_variants_file <- args[['dms_variants_file']]
non_dms_variants_file <- args[['non_dms_variants_file']]
output_file <- args[['output_file']]

### Load non-DMS oligo sequences 

In [3]:
non_dms_variants <- read_tsv(non_dms_variants_file, show_col_types = F)  %>%
  rename(insert_seq = nt63) %>%
  select(-orf_classification, -positive, -bulky, -p, -g, -d, -strength)

non_dms_variants

motif,insert_seq,type,id,gene,class,loc
<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>
KRAAARQAMAAAKAKAKKAAA,AAGCGTGCAGCAGCAAGGCAGGCAATGGCAGCAGCAAAAGCAAAAGCCAAAAAAGCAGCAGCA,posbulky,YAR047C,,endo_ala_cntrl,
TREARAPSQAAAARRARAARR,ACTAGAGAAGCTCGTGCACCATCCCAAGCAGCAGCAGCAAGAAGAGCACGTGCAGCAAGAAGA,posbulky,YBL027W,RPL19B,endo_ala_cntrl,
AATKAKKKKAANRAKAQKANA,GCAGCAACAAAAGCAAAAAAGAAGAAAGCAGCAAATAGGGCTAAAGCACAAAAGGCAAATGCA,FK,YBR129C,OPY1,endo_ala_cntrl,
KAAGKKDAKMAARARKAAREA,AAAGCAGCAGGTAAAAAGGATGCAAAAATGGCAGCAAGAGCAAGAAAGGCAGCAAGGGAAGCA,FK,YCL054W,SPB1,endo_ala_cntrl,
KAPRSSAAAENKAAKAKKAKR,AAAGCACCTAGATCATCGGCAGCAGCAGAGAATAAAGCAGCAAAGGCAAAAAAGGCAAAGAGG,posbulky,YDL030W,PRP9,endo_ala_cntrl,
⋮,⋮,⋮,⋮,⋮,⋮,⋮
KRRKRRRKRSPRKRRKRRNKD,AAGAGAAGAAAGAGAAGAAGGAAAAGAAGTCCAAGAAAGAGAAGAAAGAGAAGAAATAAGGAT,posbulky,YOR309C,,endo_kr_cntrl,76
STAGADAKAKTAEARGKKARAQAA,TCCACCGCAGGTGCAGACGCAAAAGCAAAAACTGCAGAGGCAAGAGGAAAGAAAGCAAGAGCACAGGCAGCA,RAB12,RAB12,RAB12,human_ala_cntrl,
AAAAAADRAARAKATARAKRATTH,GCAGCAGCCGCAGCTGCAGACCGAGCAGCACGGGCAAAGGCAACCGCAAGAGCAAAGAGGGCAACCACTCAC,ADORA3,ADORA3,ADORA3,human_ala_cntrl,
STVGVDFKIKTVELRGKKIRLQIW,TCCACCGTGGGTGTTGACTTCAAAATCAAAACTGTAGAGCTAAGAGGAAAGAAAATTAGATTACAGATCTGG,RAB12,RAB12,RAB12,human_stall,


### Load DMS oligo sequences 

In [4]:
dms_variants <- read_csv(dms_variants_file, show_col_types = F) %>% 
  # get rid of flanking sequences
  mutate(insert_seq = substr(nt, 19, nchar(nt) - 24)) %>%
  rename(id = name, wt_motif = aa) %>%
  select(id, wt_motif, mutstart, insert_seq)

dms_variants

id,wt_motif,mutstart,insert_seq
<chr>,<chr>,<dbl>,<chr>
sdd1,DFFYEDYLIFDCRAKRRKKLK,1,NNKTTCTTTTATGAAGATTATTTGATTTTCGATTGCAGAGCGAAACGACGAAAAAAATTGAAG
sdd1,DFFYEDYLIFDCRAKRRKKLK,4,GATNNKTTTTATGAAGATTATTTGATTTTCGATTGCAGAGCGAAACGACGAAAAAAATTGAAG
sdd1,DFFYEDYLIFDCRAKRRKKLK,7,GATTTCNNKTATGAAGATTATTTGATTTTCGATTGCAGAGCGAAACGACGAAAAAAATTGAAG
sdd1,DFFYEDYLIFDCRAKRRKKLK,10,GATTTCTTTNNKGAAGATTATTTGATTTTCGATTGCAGAGCGAAACGACGAAAAAAATTGAAG
sdd1,DFFYEDYLIFDCRAKRRKKLK,13,GATTTCTTTTATNNKGATTATTTGATTTTCGATTGCAGAGCGAAACGACGAAAAAAATTGAAG
⋮,⋮,⋮,⋮
fk8,FKFKFKFKFKFKFKFK,34,TTCAAGTTTAAGTTCAAGTTCAAATTCAAGTTTNNKTTCAAGTTCAAA
fk8,FKFKFKFKFKFKFKFK,37,TTCAAGTTTAAGTTCAAGTTCAAATTCAAGTTTAAGNNKAAGTTCAAA
fk8,FKFKFKFKFKFKFKFK,40,TTCAAGTTTAAGTTCAAGTTCAAATTCAAGTTTAAGTTCNNKTTCAAA
fk8,FKFKFKFKFKFKFKFK,43,TTCAAGTTTAAGTTCAAGTTCAAATTCAAGTTTAAGTTCAAGNNKAAA


### Get NNK codons

In [5]:
NNK_codons <- names(GENETIC_CODE) %>% 
  as_tibble_col("codon") %>% 
  filter(str_detect(codon, "G$|T$"))

NNK_codons  

codon
<chr>
TTT
TTG
TCT
TCG
TAT
⋮
GCG
GAT
GAG
GGT


### Parse NNK oligos into individual variants

In [6]:
dms_variants <- dms_variants %>% 
  filter(str_detect(insert_seq, "NNK")) %>% 
  mutate(codon = map(NNK_codons, function(x) x)) %>% 
  unnest(codon) %>% 
  mutate(insert_seq = str_replace(insert_seq, "NNK", codon)) %>%
  mutate(motif = as.character(translate(DNAStringSet(insert_seq))))

dms_variants

id,wt_motif,mutstart,insert_seq,codon,motif
<chr>,<chr>,<dbl>,<chr>,<chr>,<chr>
sdd1,DFFYEDYLIFDCRAKRRKKLK,1,TTTTTCTTTTATGAAGATTATTTGATTTTCGATTGCAGAGCGAAACGACGAAAAAAATTGAAG,TTT,FFFYEDYLIFDCRAKRRKKLK
sdd1,DFFYEDYLIFDCRAKRRKKLK,1,TTGTTCTTTTATGAAGATTATTTGATTTTCGATTGCAGAGCGAAACGACGAAAAAAATTGAAG,TTG,MFFYEDYLIFDCRAKRRKKLK
sdd1,DFFYEDYLIFDCRAKRRKKLK,1,TCTTTCTTTTATGAAGATTATTTGATTTTCGATTGCAGAGCGAAACGACGAAAAAAATTGAAG,TCT,SFFYEDYLIFDCRAKRRKKLK
sdd1,DFFYEDYLIFDCRAKRRKKLK,1,TCGTTCTTTTATGAAGATTATTTGATTTTCGATTGCAGAGCGAAACGACGAAAAAAATTGAAG,TCG,SFFYEDYLIFDCRAKRRKKLK
sdd1,DFFYEDYLIFDCRAKRRKKLK,1,TATTTCTTTTATGAAGATTATTTGATTTTCGATTGCAGAGCGAAACGACGAAAAAAATTGAAG,TAT,YFFYEDYLIFDCRAKRRKKLK
⋮,⋮,⋮,⋮,⋮,⋮
fk8,FKFKFKFKFKFKFKFK,46,TTCAAGTTTAAGTTCAAGTTCAAATTCAAGTTTAAGTTCAAGTTCGCG,GCG,FKFKFKFKFKFKFKFA
fk8,FKFKFKFKFKFKFKFK,46,TTCAAGTTTAAGTTCAAGTTCAAATTCAAGTTTAAGTTCAAGTTCGAT,GAT,FKFKFKFKFKFKFKFD
fk8,FKFKFKFKFKFKFKFK,46,TTCAAGTTTAAGTTCAAGTTCAAATTCAAGTTTAAGTTCAAGTTCGAG,GAG,FKFKFKFKFKFKFKFE
fk8,FKFKFKFKFKFKFKFK,46,TTCAAGTTTAAGTTCAAGTTCAAATTCAAGTTTAAGTTCAAGTTCGGT,GGT,FKFKFKFKFKFKFKFG


### Write variant annotations to TSV file

In [7]:
variant_annotations <- non_dms_variants %>% 
  bind_rows(dms_variants) %>% 
  mutate(insert_num = seq(1, dplyr::n())) %>% 
  select(insert_num, insert_seq, everything()) %>%
  write_csv(output_file)

variant_annotations  

insert_num,insert_seq,motif,type,id,gene,class,loc,wt_motif,mutstart,codon
<int>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>,<chr>,<dbl>,<chr>
1,AAGCGTGCAGCAGCAAGGCAGGCAATGGCAGCAGCAAAAGCAAAAGCCAAAAAAGCAGCAGCA,KRAAARQAMAAAKAKAKKAAA,posbulky,YAR047C,,endo_ala_cntrl,,,,
2,ACTAGAGAAGCTCGTGCACCATCCCAAGCAGCAGCAGCAAGAAGAGCACGTGCAGCAAGAAGA,TREARAPSQAAAARRARAARR,posbulky,YBL027W,RPL19B,endo_ala_cntrl,,,,
3,GCAGCAACAAAAGCAAAAAAGAAGAAAGCAGCAAATAGGGCTAAAGCACAAAAGGCAAATGCA,AATKAKKKKAANRAKAQKANA,FK,YBR129C,OPY1,endo_ala_cntrl,,,,
4,AAAGCAGCAGGTAAAAAGGATGCAAAAATGGCAGCAAGAGCAAGAAAGGCAGCAAGGGAAGCA,KAAGKKDAKMAARARKAAREA,FK,YCL054W,SPB1,endo_ala_cntrl,,,,
5,AAAGCACCTAGATCATCGGCAGCAGCAGAGAATAAAGCAGCAAAGGCAAAAAAGGCAAAGAGG,KAPRSSAAAENKAAKAKKAKR,posbulky,YDL030W,PRP9,endo_ala_cntrl,,,,
⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮
1370,TTCAAGTTTAAGTTCAAGTTCAAATTCAAGTTTAAGTTCAAGTTCGCG,FKFKFKFKFKFKFKFA,,fk8,,,,FKFKFKFKFKFKFKFK,46,GCG
1371,TTCAAGTTTAAGTTCAAGTTCAAATTCAAGTTTAAGTTCAAGTTCGAT,FKFKFKFKFKFKFKFD,,fk8,,,,FKFKFKFKFKFKFKFK,46,GAT
1372,TTCAAGTTTAAGTTCAAGTTCAAATTCAAGTTTAAGTTCAAGTTCGAG,FKFKFKFKFKFKFKFE,,fk8,,,,FKFKFKFKFKFKFKFK,46,GAG
1373,TTCAAGTTTAAGTTCAAGTTCAAATTCAAGTTTAAGTTCAAGTTCGGT,FKFKFKFKFKFKFKFG,,fk8,,,,FKFKFKFKFKFKFKFK,46,GGT
