# Imports and definitions

In [14]:
%load_ext autoreload

from utils.preprocessing import process_seqs
from model.train_model import train_siamese_model
from model.models_cstm import get_embedding_model
import pickle

def test_split(seqs_path, y_path, depth):
    data = process_seqs(
        seqs_path=seqs_path,
        train_test_split='distance',
        split_depth=depth,
        test_size=0.2,
        val_size=0.2,
        load_y=y_path,
        lim=1000,
        verbose=True
    )
    in_dim = data[0][0].shape[1] # size of sequence
    out_dim = int(in_dim / 2)
    model, score, history = train_siamese_model(
        data,
        embedding_model=get_embedding_model(in_dim=in_dim, out_dim=out_dim),
        distance_metric='euclidean',
        epochs=1,
        batch_size=512
    )
    return model, score, history

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# PheS case

In [23]:
phes_fasta_path = "./data/phes/phes_na.fa"
phes_y_path = "./data/phes/y.pkl"
history_path="./results/train_test/phes"

In [None]:
# max_depth = np.ceil(np.log2(0.2 * n_seqs))
scores = pd.DataFrame(columns=["depth", "score"])
for depth in range(12):
    # Running function
    print(f"DEPTH = {depth}:", "#"*100)
    model, score, history = test_split(phes_fasta_path, phes_y_path, 0)
    print(f"\tSCORE: {score}")
    
    # Saving
    model.save(f"{history_path}/model_{depth}")
    pd.DataFrame(history.history).to_pickle(f"{history_path}/history_{depth}.pkl")
    scores = scores.append({"depth" : depth, "score" : score}, ignore_index=True)

scores.to_pickle(f"{history_path}/scores.pkl")

DEPTH = 0: ####################################################################################################
Reading inputs...
	Done in 0.627 seconds
	Shape of X: (1000, 1674)
Loading distances from ./data/phes/y.pkl
Splitting X values...
	Done in 0.418 seconds
	Shapes of data: (600, 1674), (200, 1674), (200, 1674)
	SCORE: 6795.130859375
INFO:tensorflow:Assets written to: ./results/train_test/phes/model_0/assets
DEPTH = 1: ####################################################################################################
Reading inputs...
	Done in 0.489 seconds
	Shape of X: (1000, 1674)
Loading distances from ./data/phes/y.pkl
Splitting X values...
	Done in 0.277 seconds
	Shapes of data: (600, 1674), (200, 1674), (200, 1674)
	SCORE: 9112.9375
INFO:tensorflow:Assets written to: ./results/train_test/phes/model_1/assets
DEPTH = 2: ####################################################################################################
Reading inputs...
	Done in 0.548 seconds
	Shape of X: (

In [None]:
rna_fasta_path = "./data/16s/16s_na.fa"
rna_y_path = "./data/16s/y.pkl"
history_path="./results/train_test/16s"

In [None]:
# max_depth = np.ceil(np.log2(0.2 * n_seqs))
scores = pd.DataFrame(columns=["depth", "score"])
for depth in range(12):
    # Running function
    print(f"DEPTH = {depth}:", "#"*100)
    model, score, history = test_split(rna_fasta_path, rna_y_path, 0)
    print(f"\tSCORE: {score}")
    
    # Saving
    model.save(f"{history_path}/model_{depth}")
    pd.DataFrame(history.history).to_pickle(f"{history_path}/history_{depth}.pkl")
    scores = scores.append({"depth" : depth, "score" : score}, ignore_index=True)

scores.to_pickle(f"{history_path}/scores.pkl")