# Imports and definitions

In [7]:
%load_ext autoreload

import pickle
import pandas as pd
import tensorflow as tf

from utils.preprocessing import process_seqs
from model.train_model import train_siamese_model
from model.models_cstm import get_embedding_model


def test_split(seqs_path, y_path, depth, *, epochs=2, batch_size=256, learning_rate=1e-2):
    """
    Train and score one siamese model for a cladistic train-test split at a given depth.

    The idea is to vary ``depth`` while keeping every other setting fixed, so that
    results obtained at different depths can be compared with each other.

    Parameters
    ----------
    seqs_path : str
        Path to the sequence file; forwarded to ``process_seqs`` as ``seqs_path``.
    y_path : str
        Path to the precomputed y (distance) matrix; forwarded as ``load_y``.
    depth : int
        Forwarded to ``process_seqs`` as ``split_depth``.
    epochs : int, optional
        Number of training epochs passed to ``train_siamese_model`` (default 2).
    batch_size : int, optional
        Batch size passed to ``train_siamese_model`` (default 256).
    learning_rate : float, optional
        Learning rate of the Adam optimizer (default 1e-2).

    Returns
    -------
    tuple
        ``(model, score, history)`` exactly as returned by ``train_siamese_model``.
    """
    # Load data from the known y matrix, split by distance at the requested depth.
    data = process_seqs(
        seqs_path=seqs_path,
        train_test_split='distance',
        split_depth=depth,
        test_size=0.2,
        val_size=0.2,
        load_y=y_path,
        verbose=True,
    )

    # Input width is the column count of data[0][0]
    # (presumably the training inputs -- TODO confirm against process_seqs).
    in_dim = data[0][0].shape[1]
    # The embedding is half as wide as the input.
    out_dim = int(in_dim * 0.5)

    # NOTE: an MLP variant was tried previously (model_choice='MLP' with
    # mlp_num_units_hidden=int(in_dim * 0.75)); the CNN is the one in use.
    embedding_model = get_embedding_model(
        in_dim=in_dim,
        out_dim=out_dim,
        model_choice='CNN',
    )

    # Building the optimizer here keeps hyperparameters identical across depths.
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

    model, score, history = train_siamese_model(
        data,
        embedding_model=embedding_model,
        optimizer=optimizer,
        distance_metric='euclidean',
        epochs=epochs,
        batch_size=batch_size,
    )
    return model, score, history

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [9]:
# Show which GPU devices TensorFlow can see; an empty list means none was detected.
tf.config.list_physical_devices('GPU')

[]

# PheS case

In [2]:
# PheS inputs (sequences and y matrix) and the directory that receives the results.
phes_fasta_path, phes_y_path = "./data/phes/phes_na.fa", "./data/phes/y.pkl"
history_path = "./results/train_test/phes"

In [3]:
!mkdir ./results/train_test/phes

mkdir: cannot create directory ‘./results/train_test/phes’: File exists


In [4]:
# Possible upper bound for the sweep (not applied): max_depth = np.ceil(np.log2(0.2 * n_seqs))
# Collect one record per depth and build the DataFrame once at the end;
# DataFrame.append was removed in pandas 2.0 and was quadratic when used per row.
score_records = []
for depth in range(1, 10):
    # Running function
    print(f"DEPTH = {depth}:", "#"*100)
    model, score, history = test_split(phes_fasta_path, phes_y_path, depth)
    print(f"\tSCORE: {score}")

    # Saving the per-depth model and its training history
    model.save(f"{history_path}/model_{depth}.tf", save_format="tf")
    pd.DataFrame(history.history).to_pickle(f"{history_path}/history_{depth}.pkl")
    score_records.append({"depth": depth, "score": score})

# Explicit columns keep the frame's layout stable even if no depth was run.
scores = pd.DataFrame(score_records, columns=["depth", "score"])
scores.to_pickle(f"{history_path}/scores.pkl")

DEPTH = 1: ####################################################################################################
Reading inputs...
	Done in 3.311 seconds
	Shape of X: (7010, 1821)
Loading distances from ./data/phes/y.pkl
Splitting X values...
	Done in 0.762 seconds
	Shapes of data: (4205, 1821), (1403, 1821), (1402, 1821)
Epoch 1/2
   15/34526 [..............................] - ETA: 15:25:49 - loss: 132447.4531

KeyboardInterrupt: 

# 16S rRNA case

In [3]:
# 16S rRNA inputs (sequences and y matrix) and the directory that receives the results.
rna_fasta_path, rna_y_path = "./data/16s/16s_na.fa", "./data/16s/y.pkl"
history_path = "./results/train_test/16s"

In [4]:
!mkdir ./results/train_test/16s

mkdir: cannot create directory ‘./results/train_test/16s’: File exists


In [5]:
# Possible upper bound for the sweep (not applied): max_depth = np.ceil(np.log2(0.2 * n_seqs))
# Collect one record per depth and build the DataFrame once at the end;
# DataFrame.append was removed in pandas 2.0 and was quadratic when used per row.
score_records = []
for depth in range(1, 10):
    # Running function
    print(f"DEPTH = {depth}:", "#"*100)
    model, score, history = test_split(rna_fasta_path, rna_y_path, depth)
    print(f"\tSCORE: {score}")

    # Saving the per-depth model and its training history
    model.save(f"{history_path}/model_{depth}.tf", save_format="tf")
    pd.DataFrame(history.history).to_pickle(f"{history_path}/history_{depth}.pkl")
    score_records.append({"depth": depth, "score": score})

# Explicit columns keep the frame's layout stable even if no depth was run.
scores = pd.DataFrame(score_records, columns=["depth", "score"])
scores.to_pickle(f"{history_path}/scores.pkl")

DEPTH = 0: ####################################################################################################
Reading inputs...
	Done in 10.607 seconds
	Shape of X: (16861, 3232)
Loading distances from ./data/16s/y.pkl


KeyboardInterrupt: 