# Imports and definitions

In [7]:
%load_ext autoreload

import pickle
import pandas as pd
import tensorflow as tf

from utils.preprocessing import process_seqs
from model.train_model import train_siamese_model
from model.models_cstm import get_embedding_model


def test_split(seqs_path, y_path, depth, epochs=2, batch_size=512):
    """
    Train one siamese embedding model on a distance-based split at a given depth.

    Helper for the cladistic train-test split analysis. The idea is to vary
    `depth` while keeping every other setting (split sizes, embedding model,
    optimizer, distance metric) fixed, so runs at different depths are comparable.

    Parameters
    ----------
    seqs_path : str
        Path to the sequence file; forwarded to `process_seqs` as `seqs_path`.
    y_path : str
        Path to a previously computed y matrix; forwarded to `process_seqs`
        as `load_y`.
    depth : int
        Forwarded to `process_seqs` as `split_depth`.
    epochs : int, optional
        Number of training epochs passed to `train_siamese_model` (default 2).
    batch_size : int, optional
        Batch size passed to `train_siamese_model` (default 512).

    Returns
    -------
    tuple
        `(model, score, history)` exactly as returned by `train_siamese_model`.
    """
    # Load data from known y matrix
    data = process_seqs(
        seqs_path=seqs_path,
        train_test_split='distance',
        split_depth=depth,
        test_size=0.2,
        val_size=0.2,
        load_y=y_path,
        verbose=True
    )

    # Get input/output sizes.
    # NOTE(review): data[0][0] is presumably the training input matrix, with
    # axis 1 as the per-sequence feature width -- confirm against process_seqs.
    in_dim = data[0][0].shape[1]
    out_dim = int(in_dim * 0.5)

    # Building the optimizer and model inside the function keeps the
    # hyperparameters identical across depths. These are the settings used for
    # the hyperbolic metric; an earlier variant used plain Adam with
    # learning_rate=1e-2 and no gradient clipping.
    optimizer = tf.keras.optimizers.Adam(
        learning_rate=1,
        clipnorm=1
    )

    # The LINEAR embedding is built without a hidden width. An earlier variant
    # passed mlp_num_units_hidden=int(in_dim * 0.75) to get_embedding_model.
    embedding_model = get_embedding_model(
        in_dim=in_dim,
        out_dim=out_dim,
        model_choice='LINEAR',
    )

    model, score, history = train_siamese_model(
        data,
        embedding_model=embedding_model,
        optimizer=optimizer,
        distance_metric='hyperbolic',
        epochs=epochs,
        batch_size=batch_size,
    )
    return model, score, history

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
# Show the GPU devices TensorFlow can see before starting the training sweeps.
tf.config.list_physical_devices('GPU')

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

# PheS case

In [3]:
!mkdir ./data/phes
!wget -O ./data/phes/phes_na.fa https://www.dropbox.com/s/fdto3lznea8zqyk/phes_na.fa
!wget -O ./data/phes/y.pkl https://www.dropbox.com/s/8nt968mahec1urf/y.pkl?dl=0

mkdir: cannot create directory ‘./data/phes’: File exists
--2021-12-20 06:48:47--  https://www.dropbox.com/s/fdto3lznea8zqyk/phes_na.fa
Resolving www.dropbox.com (www.dropbox.com)... 162.125.3.18, 2620:100:6019:18::a27d:412
Connecting to www.dropbox.com (www.dropbox.com)|162.125.3.18|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: /s/raw/fdto3lznea8zqyk/phes_na.fa [following]
--2021-12-20 06:48:48--  https://www.dropbox.com/s/raw/fdto3lznea8zqyk/phes_na.fa
Reusing existing connection to www.dropbox.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://uc4a656025959d0216666741cf57.dl.dropboxusercontent.com/cd/0/inline/BcMdLoSqQo_I898x2oAXDi3rj-2FN10RGD4PMrq5GaEYZ2T1pCaIYAMqw4uvwu-xIXtCjGX6m6dMh8d9AolrcVlIB0yBW76eb5p91P9HjcZMcnjUn9pSizvFZp9jDYwIOuTQyfCY-SgGwVI862RWwOwb/file# [following]
--2021-12-20 06:48:48--  https://uc4a656025959d0216666741cf57.dl.dropboxusercontent.com/cd/0/inline/BcMdLoSqQo_I898x2oAXDi3rj-2FN10RGD4PMr

In [4]:
# PheS inputs: sequence file and the y matrix downloaded above.
phes_fasta_path = "./data/phes/phes_na.fa"
phes_y_path = "./data/phes/y.pkl"

# Output directory for the models, histories and scores of the PheS sweep.
history_path = "./results/train_test/phes"

In [5]:
!mkdir ./results/train_test/phes

mkdir: cannot create directory ‘./results/train_test/phes’: File exists


In [8]:
# Sweep the split depth on the PheS data. Each iteration trains one model and
# saves the model and its training history; the per-depth scores are written
# once at the end.
# Rows are collected in a list and the frame is built once: DataFrame.append
# was removed in pandas 2.0 and copied the whole frame on every call.
score_records = []
for depth in range(1, 9):
    # Running function
    print(f"DEPTH = {depth}:", "#"*100)
    model, score, history = test_split(phes_fasta_path, phes_y_path, depth)
    print(f"\tSCORE: {score}")

    # Saving
    model.save(f"{history_path}/model_{depth}.tf", save_format="tf")
    pd.DataFrame(history.history).to_pickle(f"{history_path}/history_{depth}.pkl")
    score_records.append({"depth": depth, "score": score})

scores = pd.DataFrame(score_records, columns=["depth", "score"])
scores.to_pickle(f"{history_path}/scores.pkl")

DEPTH = 1: ####################################################################################################
Reading inputs...
	Done in 3.342 seconds
	Shape of X: (7010, 1821)
Loading distances from ./data/phes/y.pkl
Splitting X values...
	Done in 0.750 seconds
	Shapes of data: (4205, 1821), (1402, 1821), (1403, 1821)
Epoch 1/2
Epoch 2/2
	SCORE: 314984.375
INFO:tensorflow:Assets written to: ./results/train_test/phes/model_1.tf/assets
DEPTH = 2: ####################################################################################################
Reading inputs...
	Done in 3.257 seconds
	Shape of X: (7010, 1821)
Loading distances from ./data/phes/y.pkl
Splitting X values...
	Done in 0.543 seconds
	Shapes of data: (4206, 1821), (1402, 1821), (1402, 1821)
Epoch 1/2
Epoch 2/2
	SCORE: 331213.40625
INFO:tensorflow:Assets written to: ./results/train_test/phes/model_2.tf/assets
DEPTH = 3: ####################################################################################################
Read

Epoch 1/2
Epoch 2/2
	SCORE: 357259.40625
INFO:tensorflow:Assets written to: ./results/train_test/phes/model_8.tf/assets


# 16S rRNA case

In [9]:
!mkdir ./data/16s
!wget -O ./data/16s/16s_na.fa https://www.dropbox.com/s/uauwqmdjy41rf9t/16s_na.fa
!wget -O ./data/16s/y.pkl https://www.dropbox.com/s/598zhxhlhjq5hhh/y.pkl

--2021-12-20 08:17:36--  https://www.dropbox.com/s/uauwqmdjy41rf9t/16s_na.fa
Resolving www.dropbox.com (www.dropbox.com)... 162.125.3.18, 2620:100:601b:18::a27d:812
Connecting to www.dropbox.com (www.dropbox.com)|162.125.3.18|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: /s/raw/uauwqmdjy41rf9t/16s_na.fa [following]
--2021-12-20 08:17:36--  https://www.dropbox.com/s/raw/uauwqmdjy41rf9t/16s_na.fa
Reusing existing connection to www.dropbox.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://ucfe1c41f4bef4d63e16b84f0d6c.dl.dropboxusercontent.com/cd/0/inline/BcPGWiCZ7gB6qZqo6MIrJiPmwmsbWeSVpUOOR92PoMDC-GMx6GVM8o6O44lssxNIUCUnKVf1bjdU2RuyicvJtHn-WObN0j5Ydbtu2hpE18QYwdlBlCYHTKwrq4XVeIP0oYeAJ-wmYrkQt8xGaHkC-W43/file# [following]
--2021-12-20 08:17:36--  https://ucfe1c41f4bef4d63e16b84f0d6c.dl.dropboxusercontent.com/cd/0/inline/BcPGWiCZ7gB6qZqo6MIrJiPmwmsbWeSVpUOOR92PoMDC-GMx6GVM8o6O44lssxNIUCUnKVf1bjdU2RuyicvJtHn-WObN0j5Ydbt

In [10]:
# 16S rRNA inputs: sequence file and the y matrix downloaded above.
rna_fasta_path = "./data/16s/16s_na.fa"
rna_y_path = "./data/16s/y.pkl"

# Output directory for the 16S sweep. This rebinds the same `history_path`
# name that the PheS section used.
history_path = "./results/train_test/16s"

In [11]:
!mkdir ./results/train_test/16s

In [None]:
# Sweep the split depth on the 16S rRNA data. Each iteration trains one model
# and saves the model and its training history; the per-depth scores are
# written once at the end.
# Rows are collected in a list and the frame is built once: DataFrame.append
# was removed in pandas 2.0 and copied the whole frame on every call.
score_records = []
for depth in range(1, 9):
    # Running function
    print(f"DEPTH = {depth}:", "#"*100)
    model, score, history = test_split(rna_fasta_path, rna_y_path, depth)
    print(f"\tSCORE: {score}")

    # Saving
    model.save(f"{history_path}/model_{depth}.tf", save_format="tf")
    pd.DataFrame(history.history).to_pickle(f"{history_path}/history_{depth}.pkl")
    score_records.append({"depth": depth, "score": score})

scores = pd.DataFrame(score_records, columns=["depth", "score"])
scores.to_pickle(f"{history_path}/scores.pkl")

DEPTH = 1: ####################################################################################################
Reading inputs...
	Done in 11.927 seconds
	Shape of X: (16861, 3232)
Loading distances from ./data/16s/y.pkl
