# SynCDsGen: Synthetic Coding Sequences Generator (Demo)

The goal of this notebook is to show how to use the SynCDsGen module.

In [1]:
import torch
import SynCDsGen
import re
import numpy as np

## Set the generator configuration

### Automatically generate the generator configuration

Here we only need to set the number of codons, the number of amino acids (AAs) and the codon length; the remaining fields of the configuration are generated automatically.

In [3]:
# Build a configuration from its three size parameters only.
conf = SynCDsGen.SyncCDsGeneratorConf(
    nb_codons=20,    # number of codons
    nb_AAs=5,        # number of amino acids
    codon_length=2,  # codon length
)

In [4]:
# List every attribute and method name exposed by the configuration object
# (the listing also contains the inherited dunder names).
dir(conf)

['AAs',
 'AAs_initial_prob_dist',
 'AAs_stop_prob_dist',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 'bases',
 'codon_length',
 'codons',
 'constraints_dict',
 'emission_prob_t',
 'get_AAs_initial_prob_dist',
 'get_AAs_stop_prob_dist',
 'is_AA_start_AA',
 'is_AA_stop_AA',
 'nb_AAs',
 'nb_bases',
 'nb_codons',
 'nb_start_codons',
 'nb_stop_codons',
 'start_codons',
 'stop_codons',
 'transition_prob_t',
 'translation_dict']

### Manually set the generator configuration

In [3]:
# Define and configure the data generator by hand, starting from an empty configuration.
# NOTE(review): nb_codons / nb_AAs / nb_bases / codon_length / nb_start_codons /
# nb_stop_codons are not set on this path — confirm the generators derive them
# from the lists below.
conf = SynCDsGen.SyncCDsGeneratorConf()

# Alphabet and codons (every codon listed here is two bases long).
conf.bases = ['a', 'b', 'c']
conf.codons = ['aa', 'ab', 'ac', 'ba', 'bb', 'bc', 'ca', 'cb']
conf.start_codons = ['aa', 'ab']
conf.stop_codons = ['ac']

# Amino acids and, for each one, the codons that translate to it.
conf.AAs = ['A', 'B', 'C', 'D']
conf.translation_dict = {
    'A': ['aa', 'ab'],
    'B': ['ac'],
    'C': ['ba', 'bc', 'bb'],
    'D': ['ca', 'cb'],
}

# 4 x 4 table, one row and one column per amino acid; every row sums to 1.
# presumably row = current AA and column = next AA, in conf.AAs order — TODO confirm.
# torch.tensor is the documented way to build a tensor from existing data
# (torch.Tensor is the legacy constructor); the result is the same float32 tensor.
conf.transition_prob_t = torch.tensor([
    [0.0, 0.0, 0.5, 0.5],
    [0.0, 0.0, 0.5, 0.5],
    [0.0, 0.0, 0.5, 0.5],
    [0.0, 0.0, 0.5, 0.5],
])

# 4 x 3 table, one row per amino acid; every row sums to 1.
# presumably column j is the j-th codon of translation_dict[AA], zero-padded for
# amino acids with fewer than three codons — TODO confirm.
conf.emission_prob_t = torch.tensor([
    [0.5, 0.5, 0.0],
    [1.0, 0.0, 0.0],
    [0.6, 0.2, 0.2],
    [0.7, 0.3, 0.0],
])

# Sanity checks: hand-typed probability tables are easy to get wrong.
assert torch.allclose(
    conf.transition_prob_t.sum(dim=1), torch.ones(len(conf.AAs))
), "each row of transition_prob_t must sum to 1"
assert torch.allclose(
    conf.emission_prob_t.sum(dim=1), torch.ones(len(conf.AAs))
), "each row of emission_prob_t must sum to 1"

## Generate data from a purely stochastic/Markov type model

In [6]:
# Seed the global RNGs so that "Restart Kernel -> Run All" reproduces the same samples.
# assumes SynCDsGen draws from the torch and/or numpy global RNGs — TODO confirm.
torch.manual_seed(0)
np.random.seed(0)

# Purely stochastic generator driven by the current configuration.
generator = SynCDsGen.StochasticSynCDsGenerator(conf)

# sample() returns three values; only the first one (the generated table) is used here.
synthetic_data, _, _ = generator.sample(length=100, n_samples=[5], backup_dir="data")

Generation of 5 data points completed!


In [6]:
# Preview the first rows of the generated table.
synthetic_data.head()

Unnamed: 0,AAs,CDs
0,ADCB,aacabaac
1,ADCB,aacabcac
2,ACDB,abcbcaac
3,ADCB,aabbbaac
4,ADCB,abcabcac


## Generate synthetic data with conditional dependence on previously translated sequence/patterns

In [4]:
# Constraints are keyed by a regular expression; each value maps an amino acid
# to a 3-element tensor.
# presumably the regex is matched against the amino acids translated so far and
# the tensor replaces that amino acid's codon distribution — TODO confirm.
# NOTE(review): in the manual configuration 'B' is the only amino acid mapped to
# the stop codon, so the r".*B$" rule may never be triggered there — confirm.
conf.constraints_dict = {
    pattern: {'C': torch.Tensor(codon_weights)}
    for pattern, codon_weights in (
        (r".*A$", [1, 0, 0]),
        (r".*B$", [0, 1, 0]),
    )
}

In [7]:
# Seed the global RNGs so that "Restart Kernel -> Run All" reproduces the same samples.
# assumes SynCDsGen draws from the torch and/or numpy global RNGs — TODO confirm.
torch.manual_seed(0)
np.random.seed(0)

# Autoregressive generator built from the configuration, including conf.constraints_dict.
generator = SynCDsGen.AutoregressiveSynCDsGenerator(conf)

# sample() returns three values; only the first one (the generated table) is used here.
synthetic_data, _, _ = generator.sample(length=100, n_samples=[5], backup_dir="data")

Generation of 5 data points started!
Generation of 5 data points completed!


In [8]:
# Preview the first rows of the table produced by the autoregressive generator.
# NOTE(review): the saved output of this cell contains amino acid 'E' and bases
# 'd'/'e'/'f', which do not exist in the manual configuration (AAs A-D, bases a-c);
# it looks like it was produced with the auto-generated configuration — re-run to refresh.
synthetic_data.head()

Unnamed: 0,AAs,CDs
0,AEEECDEEDDDCDEDCCCCDCCCCCCECECDDCDECCECECCDDDD...,abdacfdaafcacfcfccbbccadcbcebcadadaeaecaadaead...
1,ACECEEECEEEECCCCECEDCECEDEDDEDDDCDDDECEECDDEED...,aaadcdaecdcfceadcfcfcdcdaeadaeaedaadcebfadcfad...
2,ACDDDECCCEDEDCCEECCDEEDEDECCCCDCDECDDDDDEEDDDD...,aaadbfcacccfafadbacfcbcfcbaeafcedaafaebdcfcfbd...
3,ADCECCCCDDEDCEDCEEEDCDECECDDCEEDDCECCEDCDCCCEC...,abcbadcfaeaeadaecacadbccafcecbaececfdaccafbcda...
4,AEDECDCDDCEEEDDCCCECDCDEECDEEDEEEDCEDECDDCCEEC...,abcdbbceadbfaebdccafdadacfcbcaadadadcfadccadbb...
