In [3]:
pip install rpy2

Note: you may need to restart the kernel to use updated packages.


In [4]:
%load_ext rpy2.ipython
import rpy2.robjects as robjects
# R vector of strings
from rpy2.robjects.vectors import StrVector

The rpy2.ipython extension is already loaded. To reload it, use:
  %reload_ext rpy2.ipython


In [None]:
 %%R
 # One-off setup cell: install BiocManager from CRAN when it is missing, then
 # use it to install Bioconductor release 3.10 and the Biostrings package.
 # NOTE(review): a Bioconductor release is tied to a specific R version --
 # confirm that "3.10" matches the R installation rpy2 is linked against.
 if (!require("BiocManager", quietly = TRUE))
   install.packages("BiocManager")
 BiocManager::install(version = "3.10")
 BiocManager::install("Biostrings")

In [None]:
%%R

# Validate peptide sequences.
#
# Drops NA and empty entries, removes duplicates and checks that every
# remaining sequence is built only from the one-letter codes listed in
# Biostrings::AA_STANDARD.
#
# Args:
#   sequences: character vector of peptide sequences in one-letter code.
#
# Returns:
#   Character vector of the unique, non-empty sequences, in order of first
#   appearance.
#
# Raises:
#   'No sequences!' when nothing is left after filtering or the input is not
#   a character vector; 'Extra symbols in sequences: ...' listing every symbol
#   outside the alphabet; an error when Biostrings is not installed.
check_peptides <- function(sequences) {
  sequences <-
    unique(sequences[!is.na(sequences) & sequences != ''])

  # Validate the input first: this check is cheap and does not need Biostrings.
  if (length(sequences) == 0L || !is.character(sequences)) {
    stop('No sequences!')
  }

  # requireNamespace() checks availability without attaching the package to
  # the search path; the alphabet is accessed through `Biostrings::` below.
  if (!requireNamespace('Biostrings', quietly = TRUE)) {
    stop('Package \'Biostrings\' is required to check peptide sequences')
  }
  alphabet <- Biostrings::AMINO_ACID_CODE[Biostrings::AA_STANDARD]

  # split = '' splits every sequence into single characters.
  symbols <- unique(unlist(strsplit(sequences, split = '')))
  extra <- setdiff(symbols, names(alphabet))

  if (length(extra) > 0L) {
    stop(paste0('Extra symbols in sequences: ', paste0(extra, collapse = '; ')))
  }

  sequences
}

In [35]:
%%R


# Split a list of peptides into batches of `batch_size` peptides of similar
# length.
#
# Args:
#   sequences:  character vector (or nested list) of peptide sequences;
#               it is flattened and de-duplicated.
#   optimize:   character vector of optimisation criteria. Only 'length' is
#               acted upon (sequences are ordered by number of characters);
#               'composition' is accepted but currently has no effect.
#   batch_size: single non-negative whole number. 0 puts every sequence into
#               one batch named '1'.
#
# Returns:
#   Named list of character vectors; names are the batch numbers '1', '2', ...
#   An empty list when there are no sequences.
split_peptides <-
  function(sequences,
           optimize = c('length', 'composition'),
           batch_size = 4L) {
    # Accept whole-number doubles as well (e.g. `batch_size = 4`), not only
    # values typed as integer.
    stopifnot(
      is.numeric(batch_size),
      length(batch_size) == 1L,
      is.finite(batch_size),
      batch_size >= 0,
      batch_size == trunc(batch_size)
    )
    batch_size <- as.integer(batch_size)

    sequences <-
      unique(unlist(sequences, recursive = TRUE, use.names = FALSE))

    nseq <- length(sequences)
    # No peptides means no batches; this also avoids the `1:0` sequence trap.
    if (nseq == 0L) {
      return(list())
    }

    if ('length' %in% optimize) {
      sequences <- sequences[order(nchar(sequences))]
    }

    if (batch_size == 0L) {
      return(list(`1` = sequences))
    }

    # Peptide i goes to batch ceiling(i / batch_size): 1,1,1,1,2,2,2,2,...
    batch_index <- as.integer(ceiling(seq_len(nseq) / batch_size))
    split(sequences, batch_index)
  }


# Create the per-peptide sequence files for Chorus.
#
# For every peptide of every batch a CSV file named
# '<batch_name>_<batch>_<sequence>.csv' is written into `dir`. Each file holds
# a 'Type,sequence' line, a 'fileName,<name>' line, an empty line, a
# 'singleLetterAAs,ThreeLetterAAs' header and then one line per residue with
# its one- and three-letter code. Residues are written from the last one to
# the first (presumably the order of synthesis -- confirm).
#
# Args:
#   sequences:  named list of character vectors (batch name -> peptides), as
#               returned by split_peptides(). Batches are looked up by name,
#               so an unnamed list produces no files.
#   dir:        output directory; created recursively when missing.
#   batch_name: single non-NA string used as file-name prefix.
#
# Returns:
#   Named list (one element per batch) of character vectors with the written
#   file paths, named by peptide sequence.
#
# Raises:
#   An error when `batch_name` is invalid or `dir` cannot be created.
make_chorus_peptides <-
  function(sequences,
           dir = 'data/peptide_sets/synthesis2',
           batch_name = '20240315') {
    stopifnot(
      is.character(batch_name),
      length(batch_name) == 1L,
      !is.na(batch_name)
    )
    alphabet <- Biostrings::AMINO_ACID_CODE[Biostrings::AA_STANDARD]

    dir <- normalizePath(dir, mustWork = FALSE)
    if (!dir.exists(dir)) {
      dir.create(dir, recursive = TRUE, showWarnings = FALSE)
    }
    if (!dir.exists(dir)) {
      stop(paste0('Can\'t create directory ', dir))
    }

    # Write the file for one peptide `s` of batch `batch`; returns its path.
    write_peptide <- function(s, batch) {
      filename <- paste0(batch_name, '_', batch, '_', s)
      filepath <- file.path(dir, paste0(filename, '.csv'))

      residues <- rev(strsplit(s, split = '')[[1L]])
      rows <- c(
        'Type,sequence',
        paste0('fileName,', filename),
        '',
        'singleLetterAAs,ThreeLetterAAs',
        # sprintf() yields no rows for an empty peptide, unlike paste0().
        sprintf('%s,%s', residues, alphabet[residues])
      )

      # The whole file is written in one call instead of re-opening it for
      # every residue. Each row ends with an explicit '\r' followed by '\n'.
      # NOTE(review): where text-mode connections already translate '\n' to
      # CRLF this presumably gives '\r\r\n' -- confirm on the target platform.
      cat(paste0(rows, '\r\n'), file = filepath, sep = '', append = FALSE)

      filepath
    }

    paths <- lapply(names(sequences), function(batch) {
      peptides <- sequences[[batch]]
      # vapply() keeps the result a character vector even for an empty batch.
      written <- vapply(
        peptides,
        write_peptide,
        character(1L),
        batch = batch,
        USE.NAMES = FALSE
      )
      names(written) <- peptides
      written
    })

    names(paths) <- names(sequences)
    paths
  }
# Compute the composition of a single peptide.
#
# Args:
#   seq: a single peptide sequence in one-letter code.
#
# Returns:
#   A wide data.table with the columns `sequence`, `reactions`
#   (nchar(seq) - 1), `resin_<last residue>` (set to 1) and one count column
#   per residue occurring before the last one.
#
# NOTE(review): for a one-residue sequence nothing is left to count after the
# last residue is dropped -- behaviour for that input is unverified.
seq_stat <- function(seq) {
  library('data.table')
  # Split the sequence into single characters.
  splits <- strsplit(seq, split = '|')[[1L]]
  # Count every residue except the last one, then reshape the counts to wide
  # form (one column per residue). The `.` column produced by the `. ~ AA`
  # formula is removed, and `sequence` / `reactions` are added by reference.
  counts <-
    dcast(setnames(as.data.table(table(
      head(splits, n = -1L)
    )),
    c('V1', 'N'),
    c('AA', 'count')),
    . ~ AA,
    value.var = 'count')[, `:=`(`.` = NULL,
                                sequence = seq,
                                reactions = nchar(seq) - 1L)]
  # The last residue is recorded separately as a `resin_<residue>` column.
  r <- tail(splits, n = 1L)
  counts[, resin_ := 1L]
  setnames(counts, 'resin_', paste0('resin_', r))

  # Column order: sequence, reactions, resin columns (sorted), then the
  # remaining columns (sorted). The reordered table is the return value.
  setcolorder(counts, c('sequence', 'reactions', sort(colnames(counts)[grepl('resin', colnames(counts))]), sort(setdiff(
    colnames(counts), c('sequence', 'reactions', colnames(counts)[grepl('resin', colnames(counts))])
  ))))
}


# Count how many reaction vessels are in use at each synthesis step
# (needed for predicting the synthesis time).
#
# Args:
#   reactions: numeric vector with the number of reactions per peptide.
#
# Returns:
#   data.table with the columns `protocol` ('initial', 'middle', 'final'),
#   `rvs` (vessels in use) and `steps` (number of steps with that occupancy).
#   An empty data.table when no peptide has a positive reaction count.
steps_count <- function(reactions) {
  # One-step row for the given protocol.
  single_step <- function(protocol_name, vessels) {
    data.table(protocol = protocol_name, rvs = vessels, steps = 1L)
  }
  # Row-bind the protocol tables by column name.
  stack_rows <- function(...) {
    rbindlist(list(...), use.names = TRUE, fill = TRUE, idcol = NULL)
  }

  if (length(reactions) == 0L) {
    return(data.table())
  }
  pending <- sort(reactions[reactions > 0L])
  if (length(pending) == 0L) {
    return(data.table())
  }

  # Every peptide with at least one reaction takes part in the initial step.
  first_step <- single_step('initial', length(pending))

  pending <- pending[pending > 1L]
  if (length(pending) == 0L) {
    return(first_step)
  }

  # The peptides with the largest reaction count take part in the final step;
  # that step is then removed from their count.
  longest <- max(pending)
  at_longest <- pending == longest
  last_step <- single_step('final', sum(at_longest))

  pending[at_longest] <- longest - 1L
  pending <- pending[pending > 1L]
  if (length(pending) == 0L) {
    return(stack_rows(first_step, last_step))
  }

  # For each middle step count the vessels still in use, then tabulate how
  # many steps share the same occupancy.
  step_ids <- 2L:max(pending)
  occupancy <- table(vapply(step_ids, function(k) sum(pending >= k), integer(1L)))
  middle_steps <- setnames(as.data.table(occupancy), c('rvs', 'steps'))
  middle_steps[, `:=`(protocol = 'middle', rvs = as.integer(rvs))]

  stack_rows(first_step, middle_steps, last_step)
}
           
# Predict the synthesis time from the number of reaction vessels used at each
# step and the protocol of that step.
#
# Args:
#   steps:        table with the columns `protocol`, `rvs` and `steps`, as
#                 returned by steps_count(); may be empty.
#   coefficients: table with the columns `protocol`, `b` and `k`.
#
# Returns:
#   A single number: sum((k * rvs + b) * steps) over all matched protocol
#   rows, divided by 60 twice (presumably seconds -> hours; confirm the units
#   of `k` and `b`). 0 when `steps` has no rows.
predict_timing <-
  function(steps,
           coefficients = data.table(
             protocol = c('initial', 'middle', 'final'),
             b = c(0, 1720, 0),
             k = c(960, 250, 1700)
           )) {
    # steps_count() returns a zero-column data.table when there is nothing to
    # synthesise; merging that by 'protocol' fails, so an empty schedule is
    # reported as zero time instead.
    if (is.null(steps) || nrow(steps) == 0L) {
      return(0)
    }

    timed <- merge(steps, coefficients, by = 'protocol')
    timed[, sum((k * rvs + b) * steps) / 60 / 60]
  }



In [36]:
%%R
# Compute composition and synthesis statistics for a set of sequences.
#
# Args:
#   sequences: named list of character vectors (batch name -> peptides).
#   dir:       optional output directory. When not NULL it is created if
#              missing and 'peptide_stat.txt' and 'batch_stats.txt' are
#              written there as tab-separated files.
#
# Returns:
#   Unnamed list of two data.tables: the per-peptide table `stat` and the
#   per-batch table `batch_stats` (which ends with a 'total' row).

compute_synthesis_statistics <- function(sequences,
                                         dir = NULL) {
  library('data.table')

  if (!is.null(dir)) {
    dir <- normalizePath(dir, mustWork = FALSE)
    if (!dir.exists(dir))
      dir.create(dir, recursive = TRUE, showWarnings = FALSE)
    if (!dir.exists(dir))
      stop(paste0('Can\'t create directory ', dir))
  }

  # Per-peptide table: seq_stat() is run for every peptide of every batch and
  # the rows are stacked. The melt / drop-NA / dcast round trip replaces the
  # gaps left by `fill = TRUE` (columns absent for a peptide) with 0.
  stat <-
    dcast(
      melt(
        rbindlist(
          lapply(unique(names(sequences)), function(b)
            rbindlist(
              lapply(sequences[[b]], seq_stat),
              use.names = TRUE,
              fill = TRUE,
              idcol = NULL
            )[, batch := b]),
          use.names = TRUE,
          fill = TRUE,
          idcol = NULL
        ),
        id.vars = c('sequence', 'batch', 'reactions'),
        value.factor = FALSE
      )[!is.na(value)],
      batch + sequence + reactions  ~ variable,
      value.var = 'value',
      fill = 0L
    )

  if (!is.null(dir))
    fwrite(stat, file.path(dir, 'peptide_stat.txt'), sep = '\t')

  # Per batch: predicted time, number of peptides (`rvs`) and the largest
  # reaction count (`steps`).
  batch_timing <-
    stat[, .(batch_time_h = predict_timing(steps_count(reactions)), rvs = length(reactions), steps = max(reactions)), batch]

  # Per batch: column-wise sums of `reactions` and of every count column.
  by_batch <-
    setcolorder(
      dcast(
        melt(
          stat,
          id.vars = c('batch', 'sequence'),
          variable.factor = FALSE
        )[, .(total_value = sum(value)), .(batch, variable)],
        batch ~ variable,
        value.var = 'total_value'
      ),
      c('batch', 'reactions', sort(setdiff(
        colnames(stat), c('batch', 'sequence', 'reactions')
      )))
    )

  by_batch <- merge(by_batch, batch_timing, by = 'batch')

  # The same sums over all batches, stored as a single row labelled 'total'.
  total <-
    setcolorder(
      dcast(
        melt(
          stat,
          id.vars = c('batch', 'sequence'),
          variable.factor = FALSE
        )[, .(batch = 'total', total_value = sum(value)), .(variable)],
        batch ~ variable,
        value.var = 'total_value'
      ),
      c('batch', 'reactions', sort(setdiff(
        colnames(stat), c('batch', 'sequence', 'reactions')
      )))
    )

  # The total time is the sum of the per-batch times.
  total[, batch_time_h := sum(by_batch[, batch_time_h])]

  # `total` has no `rvs` / `steps` columns, so `fill = TRUE` leaves them NA in
  # the 'total' row.
  batch_stats <-
    rbindlist(
      list(by_batch, total),
      use.names = TRUE,
      fill = TRUE,
      idcol = NULL
    )

  batch_stats[, batch_time_h := round(batch_time_h, digits = 1L)]

  # Column order: identifiers and timing first, then the resin columns, then
  # everything else.
  setcolorder(batch_stats,
              c(
                'batch', 'rvs', 'steps',
                'reactions',
                'batch_time_h',
                colnames(batch_stats)[grepl('resin', colnames(batch_stats))],
                setdiff(
                  colnames(batch_stats),
                  c(
                    'batch', 'rvs', 'steps',
                    'reactions',
                    'batch_time_h',
                    colnames(batch_stats)[grepl('resin', colnames(batch_stats))]
                  )
                )
              ))
if (!is.null(dir))
    fwrite(batch_stats, file.path(dir, 'batch_stats.txt'), sep = '\t')

  list(stat, batch_stats)
}


In [37]:
# Python handles to the R functions defined in the %%R cells of this notebook.
# NOTE(review): rcheck_peptides is bound here but never called in this cell,
# so the peptide list is not validated before it is split -- confirm intent.
rcheck_peptides = robjects.r['check_peptides']
rsplit_peptides = robjects.r['split_peptides']
rmake_chorus_peptides = robjects.r['make_chorus_peptides']
rcompute_synthesis_statistics = robjects.r['compute_synthesis_statistics']

#  ["ADTYRTYTAD", "ATYRTYTAD", "ADTYRYTAD"]

 
# Peptides to synthesise, in one-letter amino-acid code.
peptide_list = ["ADLFYDVEALDLESPK", "AFPALTSLDLSDNPGLGER", "AIETFSGK", "AIPVTQYLK", "DFDFVPPVVR","DFVQPPTK", "EHVAHLLFLR", 
                "EPAPTTPK", "ESDTSYVSLK", "FLVGPDGIPIMR", "GDFSSANNR", "HTLNQIDEVK", "ILDDLSPR", "LNILNNNYK", "SLDFTELDVAAEK", "TNQVNSGGVLLR",
                "VLDAVR", "VLSLAQEQVGGSPEK", "YLGEEYVK"]


# Used both as the output sub-directory name and as the file-name prefix.
batch_name = "PT001"

# Split into batches, write one CSV per peptide, then compute the statistics.
# The statistics call is the last expression, so its result is the cell output.
peptide_batches = rsplit_peptides(StrVector(peptide_list))
chorus_sequences = rmake_chorus_peptides(peptide_batches, dir = f'data/synthesis/{batch_name}', batch_name = batch_name)
rcompute_synthesis_statistics(peptide_batches, dir = f'data/synthesis/{batch_name}')
# print(result.r_repr())

0,1
[no name],[RTYPES.VECSXP]
[no name],[RTYPES.VECSXP]


In [38]:
# Split the peptide list into batches again and keep the R list for inspection.
result = rsplit_peptides(StrVector(peptide_list))
# result2 = rmake_chorus_peptides(result, dir = "./", batch_name = "test")
# print(result2.r_repr())

In [39]:
# %%R

# peptide_batches <- split_peptides(sequences)
# chorus_sequences <-
#   make_chorus_peptides(peptide_batches,
#                        dir = 'data/synthesis/clin9_20240402',
#                        batch_name = 'clin9_20240402')
# compute_synthesis_statistics(peptide_batches,
#                              dir = 'data/synthesis/clin9_20240402')

In [40]:
# Split `text` into one string per line.
# NOTE(review): `text` is not defined in any cell visible in this file --
# presumably a newline-separated peptide list from an earlier session.
text.split("\n")

['ADLFYDVEALDLESPK',
 'AFPALTSLDLSDNPGLGER',
 'AIETFSGK',
 'AIPVTQYLK',
 'DFDFVPPVVR',
 'DFVQPPTK',
 'EHVAHLLFLR',
 'EPAPTTPK',
 'ESDTSYVSLK',
 'FLVGPDGIPIMR',
 'GDFSSANNR',
 'HTLNQIDEVK',
 'ILDDLSPR',
 'LNILNNNYK',
 'SLDFTELDVAAEK',
 'TNQVNSGGVLLR',
 'VLDAVR',
 'VLSLAQEQVGGSPEK',
 'YLGEEYVK']