In [5]:
import os 
# Restrict CUDA to device index 0 for this kernel. The assignment sits above
# the project imports below — presumably so that it is already in effect when
# those modules load their GPU framework (confirm).
# os.environ['CUDA_VISIBLE_DEVICES'] = '2'
os.environ['CUDA_VISIBLE_DEVICES'] = '0'


from utils.utils_cpp import cpp_predictor
# NOTE(review): cpp_generator and cpp_optimizer are used by the Generator and
# Optimizer cells further down; with the two imports below disabled, those
# cells must obtain the modules some other way.
# from utils.utils_cpp import cpp_generator
# from utils.utils_cpp import cpp_optimizer
from utils.utils_common.activator import Activation

In [13]:
import pandas as pd

# Upper-case the 'seq' column and write the table back over the same CSV
# (without the index column). The predictor cell reads this same path via
# PREDICTOR_DATA_PATH, so it sees the normalized sequences.
data = (
    pd.read_csv('./dataset/data_gene/gene_predictor_dataset.csv')
    .assign(seq=lambda frame: frame['seq'].str.upper())
)
data.to_csv('./dataset/data_gene/gene_predictor_dataset.csv', index=False)

#### Filepaths

`*_DATA_PATH` : Dataset files, used when training and sampling <br>
`*_MODEL_PATH` : Model files; where the model is saved when training, otherwise where a pre-trained model is loaded from <br>
`PREDICTOR_STATS_PATH` : Where statistics of the predictor training dataset are saved/loaded <br>

SEQ_MAX : Maximum sequence length for predictor <br>
SEED_SEQ_LENGTH : Seed sequence length for generator <br>

SMILES_PATH : SMILES for monomers <br>
FP_RADIUS : Radius of topological exploration for fingerprint <br>
FP_BITS : Size of fingerprint bit-vector

In [14]:
# --- Generator settings (disabled in this notebook; kept for reference) ---
# GENERATOR_DATA_PATH = './dataset/data_cpp/cpp_generator_dataset.txt'
# GENERATOR_MODEL_PATH = './model/model_cpp/cpp_generator.hdf5'
# SEED_SEQ_LENGTH = 10

# --- Fingerprint featurization ---
SMILES_PATH = './dataset/data_gene/nuc2smiles.json'  # monomer -> SMILES map
FP_BITS = 1024   # size of the fingerprint bit-vector
FP_RADIUS = 3    # radius of topological exploration for the fingerprint

# --- Predictor ---
# Maximum sequence length for the predictor. The original trailing comment
# read "108" — presumably an earlier value (confirm).
SEQ_MAX = 256
PREDICTOR_STATS_PATH = './dataset/data_gene/gene_predictor_dataset_stats.json'
PREDICTOR_MODEL_PATH = './model/model_gene/gene_predictor.hdf5'
PREDICTOR_DATA_PATH = './dataset/data_gene/gene_predictor_dataset.csv'

Create an instance of the Generator class with the dataset, <br>
then train and save the generator.

#### Predictor

Create an instance of the Predictor class with the dataset and other parameters, <br>
then train and save the predictor.

In [16]:
# Build the predictor from the dataset, the monomer SMILES map and the
# fingerprint / sequence-length settings of the configuration cell.
predictor = cpp_predictor.Predictor(
    data_path=PREDICTOR_DATA_PATH,
    smiles_path=SMILES_PATH,
    seq_max=SEQ_MAX,
    fp_bits=FP_BITS,
    fp_radius=FP_RADIUS,
)

# Train with checkpointing enabled.
# NOTE(review): 'checkpoint_filepath' is './model/', while the Optimizer and
# Activation cells load PREDICTOR_MODEL_PATH — confirm that train_model leaves
# the trained model where those cells expect it.
predictor.train_model(
    model_params=dict(save_checkpoint=True, checkpoint_filepath='./model/')
)

Loading Data for Training of Predictor
Featurizing Data for Predictor


#### Optimizer

Create an instance of the Optimizer class with the pre-trained models and data files. <br>
Sample sequences from a pre-trained generator to seed the genetic algorithm. <br>
Optimize the seed sequences.

In [None]:
# Imported in this cell because the notebook's import cell does not bring
# cpp_optimizer into scope; without it this cell raises NameError on a
# fresh kernel.
from utils.utils_cpp import cpp_optimizer

# Build the optimizer from the pre-trained predictor model, the dataset and
# its statistics file, and the same featurization settings the predictor uses.
optimizer = cpp_optimizer.Optimizer(
    model_path = PREDICTOR_MODEL_PATH,
    data_path = PREDICTOR_DATA_PATH,
    smiles_path = SMILES_PATH,
    stats_path = PREDICTOR_STATS_PATH,
    fp_radius = FP_RADIUS,
    fp_bits = FP_BITS,
    seq_max = SEQ_MAX
)

In [None]:
# Construct the seed generator from a generator model file and its dataset.
# NOTE(review): in the cells visible here, cpp_generator, GENERATOR_MODEL_PATH,
# GENERATOR_DATA_PATH and SEED_SEQ_LENGTH exist only as commented-out lines
# (import cell and configuration cell), so this cell raises NameError on a
# fresh kernel until they are restored. The commented-out paths point at the
# data_cpp / model_cpp folders — confirm they are valid for the gene setup.
generator = cpp_generator.Generator(
    model_path = GENERATOR_MODEL_PATH,
    data_path = GENERATOR_DATA_PATH,
    seq_length = SEED_SEQ_LENGTH
)

In [None]:
# Request seeds from the generator (n_seeds=2, seed_length=30); the result is
# handed to optimizer.optimize in the next cell.
list_seeds = generator.generate_seed(
    seed_length=30,
    n_seeds=2,
)

In [None]:
# Run the optimizer on the sampled seed sequences and preview the first two
# entries of the result (presumably a pandas DataFrame, given the .head call
# — confirm).
df = optimizer.optimize(list_seeds)
df.head(2)

#### Activation Analysis

Visualize the gradient activation of a peptide sequence (e.g., penetratin) using the pre-trained predictor.

In [None]:
# Set up the activation analysis on top of the pre-trained predictor, reusing
# the statistics file and featurization settings from the configuration cell.
activator = Activation(
    mode='cpp',
    model_path=PREDICTOR_MODEL_PATH,
    stats_path=PREDICTOR_STATS_PATH,
    smiles_path=SMILES_PATH,
    seq_max=SEQ_MAX,
    fp_bits=FP_BITS,
    fp_radius=FP_RADIUS,
)

# Analyze the example peptide sequence (penetratin, per the section text).
# NOTE(review): SMILES_PATH points at nuc2smiles.json while this input is a
# peptide sequence and mode is 'cpp' — confirm the monomer map covers these
# residues, or swap in a sequence from the gene dataset.
activator.analyze('RQIKIWFQNRRMKWKK')