# SynCDsGen: Synthetic Coding Sequences Generator (Demo)

The goal of this notebook is to show how to use the SynCDsGen module.

In [1]:
from hmmlearn import hmm

In [1]:
import torch
import SynCDsGen
import re
import numpy as np

## Generate data from a purely stochastic/Markov type model

In [7]:
#define and configure the data generator
conf = SynCDsGen.SyncCDsGeneratorConf(nb_codons=30, nb_AAs=4, codon_length=2)

In [3]:
conf.transition_prob_t

tensor([[0.0000, 0.0000, 0.5000, 0.5000],
        [0.0000, 0.0000, 0.5000, 0.5000],
        [0.0000, 0.0000, 0.5000, 0.5000],
        [0.0000, 0.0000, 0.5000, 0.5000]])

In [8]:
conf.emission_prob_t

tensor([[0.5000, 0.5000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000],
        [1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000],
        [0.0938, 0.0599, 0.0154, 0.0956, 0.0241, 0.0314, 0.0942, 0.0336, 0.0049,
         0.0642, 0.0316, 0.0167, 0.0088, 0.0017, 0.0547, 0.0905, 0.0887, 0.0734,
         0.0583, 0.0421, 0.0016, 0.0151],
        [0.0614, 0.0933, 0.3326, 0.3367, 0.1761, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000]])

In [5]:
generator = SynCDsGen.StochasticSynCDsGenerator(conf)

In [9]:
generator.sample(n_samples=250000, length=1000)

KeyboardInterrupt: 

In [2]:
#define and configure the data generator
conf = SynCDsGen.SyncCDsGeneratorConf()

conf.bases = ['a', 'b', 'c']
conf.codons = ['aa', 'ab', 'ac', 'ba', 'bb', 'bc', 'ca', 'cb']
conf.start_codons = ['aa', 'ab']
conf.stop_codons = ['ac']
conf.AAs = ['A', 'B', 'C', 'D']
conf.translation_dict = {
    'A':  ['aa', 'ab'],
    'B': ['ac'],
    'C': ['ba', 'bc', 'bb'],
    'D': ['ca', 'cb']
}
conf.transition_prob_t = torch.Tensor([[0,  0, 1/2, 1/2],
                            [0, 0, 1/2, 1/2],
                            [0, 0, 1/2, 1/2],
                            [0, 0, 1/2, 1/2]])

conf.emission_prob_t = torch.Tensor([[1/2, 1/2, 0.0],
                          [1, 0.0, 0.0],
                          [0.6, 0.2, 0.2],
                          [0.7, 0.3, 0.0]])

In [6]:
import torch

tensor = torch.randn(3, 3)

In [8]:
tensor[:, 1] = 1

tensor

tensor([[ 5.9571e-01,  1.0000e+00,  1.3763e-03],
        [-7.2472e-01,  1.0000e+00, -4.3362e-01],
        [-1.5265e+00,  1.0000e+00, -2.8351e-01]])

In [4]:
generator = SynCDsGen.StochasticSynCDsGenerator(conf)
synthetic_data, _, _ = generator.sample(length=100, n_samples=250000)

In [6]:
synthetic_data.head()

Unnamed: 0,AAs,CDs
0,ADCB,aacabaac
1,ADCB,aacabcac
2,ACDB,abcbcaac
3,ADCB,aabbbaac
4,ADCB,abcabcac


## Generate synthetic data with conditional dependence on previously translated sequence/patterns

In [2]:
conf = SynCDsGen.SyncCDsGeneratorConf()

conf.bases = ['a', 'b', 'c']
conf.codons = ['aa', 'ab', 'ac', 'ba', 'ca', 'bb', 'bc', 'cb', 'cc']
conf.start_codons = ['aa', 'ab']
#STOP_CODONS = ['cc']
conf.AAs = ['A', 'B', 'C', 'D']
conf.translation_dict = {
    'A':  ['aa', 'ab'],
    'B': ['ac'],
    'C': ['ba', 'bc', 'cb'],
    'D': ['ca', 'bb']
}
conf.transition_prob_t = torch.Tensor([[0,  1/3, 1/3, 1/3],
                            [0, 1/3, 1/3, 1/3],
                            [0, 1/3, 1/3, 1/3],
                            [0, 1/3, 1/3, 1/3]])

conf.emission_prob_t = torch.Tensor([[1/2, 1/2, 0.0],
                          [1, 0.0, 0.0],
                          [0.6, 0.2, 0.2],
                          [0.7, 0.3, 0.0]])

conf.constraints_dict = {
    r".*A$": {
        'C': torch.Tensor([1, 0, 0])
    },
    r".*B$": {
        'C': torch.Tensor([0, 1, 0])
    }
}

In [3]:
generator = SynCDsGen.AutoregressiveSynCDsGenerator(conf)

synthetic_data = generator.sample()

In [4]:
synthetic_data.head()

Unnamed: 0,AAs,CDs
0,ADBCCCCBCDBCDDCCCCCCDDBBBCDCDCCCBCDCDDCDCDDDBCCDB,abbbacbccbbcbaacbcbbacbccabbbababababacbcacaac...
1,ACBDDDCCCDCCDCCDBBBCBDCBDBDCBBDBBDCDBDBCDCCCBDDCB,aabaaccacacacbbcbacababacababacaacacacbcaccabc...
2,ADDBCCCDBDCCDCDBDBDCCDDBBCBCCBDDBCBCBCBDDDDBDCDBB,abbbcaacbcbccbcaaccabacbcababbaccaaccabababbca...
3,ABCDCDCBCBCCCBBCCCBBBCCBBDBDBCDCCBBCBDBBCBDDDCBDB,aaacbccabacabcacbcacbcbacbacacbcbcbaacacacbcba...
4,ACBDCBBDCDCBDCDCDDCBBCCCDBCBCDBCBCBDDBCBDDCCCBBDB,abbaaccabcacaccabacabaaccababbbacabbbaacacbcba...
