# Imports and definitions

In [1]:
%load_ext autoreload

from utils.preprocessing import process_seqs
from model.train_model import train_siamese_model
from model.models_cstm import get_embedding_model
import pickle
import pandas as pd

def test_split(seqs_path, y_path, depth):
    """
    Helper function for generating cladistic train-test split analysis. The idea is to vary the depth,
    keeping other variables consistent. 
    """
    # Load data from known y matrix
    data = process_seqs(
        seqs_path=seqs_path,
        train_test_split='distance',
        split_depth=depth,
        test_size=0.2,
        val_size=0.2,
        load_y=y_path,
        lim=1000,
        verbose=True
    )
    
    # Get input/output sizes
    in_dim = data[0][0].shape[1]
    out_dim = int(in_dim / 2)
    
    # Specifying model this way keeps hyperparameters consistent
    model, score, history = train_siamese_model(
        data,
        embedding_model=get_embedding_model(in_dim=in_dim, out_dim=out_dim),
        distance_metric='euclidean',
        epochs=1,
        batch_size=512
    )
    return model, score, history

# PheS case

In [2]:
# Inputs for the PheS experiment: the FASTA sequence file and the pickled y matrix
# that `test_split` loads distances from.
phes_fasta_path = "./data/phes/phes_na.fa"
phes_y_path = "./data/phes/y.pkl"

# Directory that receives the per-depth models, training histories and the score table.
history_path = "./results/train_test/phes"

In [3]:
# Sweep the split depth on the PheS data, training and saving one model per depth.
# NOTE(review): the bound of 12 is hard-coded; an earlier note proposed
# max_depth = ceil(log2(0.2 * n_seqs)) instead -- TODO confirm which bound is intended.
#
# Scores are collected in a plain list and turned into a DataFrame once at the end:
# `DataFrame.append` was removed in pandas 2.0 and copies the whole frame on every call.
score_records = []
for depth in range(12):
    # Running function
    print(f"DEPTH = {depth}:", "#"*100)
    model, score, history = test_split(phes_fasta_path, phes_y_path, depth)
    print(f"\tSCORE: {score}")

    # Saving
    model.save(f"{history_path}/model_{depth}")
    pd.DataFrame(history.history).to_pickle(f"{history_path}/history_{depth}.pkl")
    score_records.append({"depth": depth, "score": score})

# `columns=` keeps the column order (and the columns themselves, even with no records).
scores = pd.DataFrame(score_records, columns=["depth", "score"])
scores.to_pickle(f"{history_path}/scores.pkl")

DEPTH = 0: ####################################################################################################
Reading inputs...
	Done in 0.609 seconds
	Shape of X: (1000, 1674)
Loading distances from ./data/phes/y.pkl
Splitting X values...
	Done in 0.110 seconds
	Shapes of data: (600, 1674), (200, 1674), (200, 1674)


2021-12-17 19:42:21.870138: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2021-12-17 19:42:22.382457: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


	SCORE: 9410.529296875


2021-12-17 19:43:52.610151: W tensorflow/python/util/util.cc:348] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.


INFO:tensorflow:Assets written to: ./results/train_test/phes/model_0/assets
DEPTH = 1: ####################################################################################################
Reading inputs...
	Done in 0.632 seconds
	Shape of X: (1000, 1674)
Loading distances from ./data/phes/y.pkl
Splitting X values...
	Done in 0.122 seconds
	Shapes of data: (600, 1674), (200, 1674), (200, 1674)
	SCORE: 8440.3974609375
INFO:tensorflow:Assets written to: ./results/train_test/phes/model_1/assets
DEPTH = 2: ####################################################################################################
Reading inputs...
	Done in 0.701 seconds
	Shape of X: (1000, 1674)
Loading distances from ./data/phes/y.pkl
Splitting X values...
	Done in 0.198 seconds
	Shapes of data: (599, 1674), (200, 1674), (201, 1674)
	SCORE: 9571.345703125
INFO:tensorflow:Assets written to: ./results/train_test/phes/model_2/assets
DEPTH = 3: ##########################################################################

# 16S rRNA case

In [4]:
# Inputs for the 16S rRNA experiment: the FASTA sequence file and the pickled y matrix
# that `test_split` loads distances from.
rna_fasta_path = "./data/16s/16s_na.fa"
rna_y_path = "./data/16s/y.pkl"

# Directory that receives the per-depth models, training histories and the score table.
# NOTE: this rebinds the same `history_path` name used by the PheS section above.
history_path = "./results/train_test/16s"

In [7]:
# Sweep the split depth on the 16S rRNA data, training and saving one model per depth.
# NOTE(review): the bound of 12 is hard-coded; an earlier note proposed
# max_depth = ceil(log2(0.2 * n_seqs)) instead -- TODO confirm which bound is intended.
#
# Scores are collected in a plain list and turned into a DataFrame once at the end:
# `DataFrame.append` was removed in pandas 2.0 and copies the whole frame on every call.
score_records = []
for depth in range(12):
    # Running function
    print(f"DEPTH = {depth}:", "#"*100)
    model, score, history = test_split(rna_fasta_path, rna_y_path, depth)
    print(f"\tSCORE: {score}")

    # Saving
    model.save(f"{history_path}/model_{depth}")
    pd.DataFrame(history.history).to_pickle(f"{history_path}/history_{depth}.pkl")
    score_records.append({"depth": depth, "score": score})

# `columns=` keeps the column order (and the columns themselves, even with no records).
scores = pd.DataFrame(score_records, columns=["depth", "score"])
scores.to_pickle(f"{history_path}/scores.pkl")

DEPTH = 0: ####################################################################################################
Reading inputs...
	Done in 1.050 seconds
	Shape of X: (1000, 1803)
Loading distances from ./data/16s/y.pkl
Splitting X values...
	Done in 0.861 seconds
	Shapes of data: (600, 1803), (200, 1803), (200, 1803)
	SCORE: 6233.18896484375
INFO:tensorflow:Assets written to: ./results/train_test/16s/model_0/assets
DEPTH = 1: ####################################################################################################
Reading inputs...
	Done in 1.069 seconds
	Shape of X: (1000, 1803)
Loading distances from ./data/16s/y.pkl
Splitting X values...
	Done in 1.291 seconds
	Shapes of data: (600, 1803), (200, 1803), (200, 1803)
	SCORE: 5346.96826171875
INFO:tensorflow:Assets written to: ./results/train_test/16s/model_1/assets
DEPTH = 2: ####################################################################################################
Reading inputs...
	Done in 1.186 seconds
	Shape of

FileNotFoundError: [Errno 2] No such file or directory: './data/16s/16s_na.fa'