In [None]:
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
import numpy as np
import pandas as pd

In [31]:
# Root folder the embedding matrices (.npy) are loaded from.
hn_dir = "../../out/201027/embedding/"

# One sub-folder per model. The trailing "/" matters: paths below are built
# by plain string concatenation.
models_dir = [
    "lstm5/",
    "seq2seq/",
    "transformer_encoder/",
]

# Dataset names; combined with a tuning name to form "<dataset>_<tuning>.npy".
data_dir = [
    "motor_toolkit",
    "pfamA_motors_balanced",
    "pfamA_random",
    "pfamA_target",
]

# Tuning variants; each one also gets its own output sub-folder.
tuning_dir = [
    "evotune_balanced",
    "evotune_balanced_target",
    "mini_balanced",
    "mini_balanced_target",
    "raw",
]

# Output roots for the 2-D projections.
tsne_out_dir = "../../out/201027/tsne/"
pca_out_dir = "../../out/201027/pca/"

def joint_embed_tsne(target, background, random_state=None):
    """Embed two sample sets together in one shared 2-D t-SNE space.

    The rows of ``target`` and ``background`` are stacked, reduced to 5
    principal components with PCA, and then mapped to 2 dimensions with
    t-SNE. The joint result is split back into the two original groups.

    Parameters
    ----------
    target, background : numpy.ndarray
        2-D arrays with the same number of columns; they are concatenated
        along axis 0.
    random_state : int, RandomState instance or None, default None
        Forwarded to both ``PCA`` and ``TSNE``. t-SNE is stochastic, so pass
        an int to get the same layout on every run. ``None`` keeps the
        original unseeded behaviour.

    Returns
    -------
    (target_em, background_em) : tuple of numpy.ndarray
        2-column embeddings of the ``target`` rows and the ``background``
        rows, in their original row order.
    """
    jointed = np.concatenate((target, background), axis=0)
    pca_jointed = PCA(n_components=5, random_state=random_state).fit_transform(jointed)
    jointed_em_2 = TSNE(n_components=2, random_state=random_state).fit_transform(pca_jointed)
    # The first target.shape[0] rows of the joint embedding belong to `target`.
    n_target = target.shape[0]
    target_em = jointed_em_2[:n_target, :]
    background_em = jointed_em_2[n_target:, :]
    return target_em, background_em

def joint_embed_pca(target,background):
    """Project two sample sets onto the same 2 principal components.

    ``target`` and ``background`` are stacked row-wise, a 2-component PCA is
    fitted on the stacked matrix, and the projected rows are split back into
    the two groups.

    Returns a ``(target_em, background_em)`` tuple of 2-column arrays.
    """
    stacked = np.concatenate((target, background), axis=0)
    projector = PCA(n_components=2)
    projected = projector.fit_transform(stacked)
    # Rows keep their stacking order, so the target rows come first.
    split_at = target.shape[0]
    return projected[:split_at, :], projected[split_at:, :]

In [11]:
# Create the output tree <out root>/<model>/<tuning method>/ for both the PCA
# and the t-SNE results.
import os, sys
for model_dir in models_dir:
    # makedirs(exist_ok=True) creates missing parent folders and does not
    # raise when the folder already exists, so this cell can be re-run safely.
    os.makedirs(pca_out_dir+model_dir, exist_ok=True)
    os.makedirs(tsne_out_dir+model_dir, exist_ok=True)
    for tuning_dir_ in tuning_dir:
        pca_path = pca_out_dir+model_dir+tuning_dir_
        print(pca_path)
        os.makedirs(pca_path, exist_ok=True)
        tsne_path = tsne_out_dir+model_dir+tuning_dir_
        os.makedirs(tsne_path, exist_ok=True)

../../out/201027/pca/lstm5/evotune_balanced
../../out/201027/pca/lstm5/evotune_balanced_target
../../out/201027/pca/lstm5/mini_balanced
../../out/201027/pca/lstm5/mini_balanced_target
../../out/201027/pca/lstm5/raw
../../out/201027/pca/seq2seq/evotune_balanced
../../out/201027/pca/seq2seq/evotune_balanced_target
../../out/201027/pca/seq2seq/mini_balanced
../../out/201027/pca/seq2seq/mini_balanced_target
../../out/201027/pca/seq2seq/raw
../../out/201027/pca/transformer_encoder/evotune_balanced
../../out/201027/pca/transformer_encoder/evotune_balanced_target
../../out/201027/pca/transformer_encoder/mini_balanced
../../out/201027/pca/transformer_encoder/mini_balanced_target
../../out/201027/pca/transformer_encoder/raw


## TSNE/PCA 
- Two projections: PCA directly to 2 dimensions, or PCA to 5 dimensions followed by t-SNE to 2 dimensions
- Individual embedding: each of the sets ["motor_toolkit","pfamA_motors_balanced","pfamA_random","pfamA_target"] is embedded on its own
- Joint embedding: jointly embed everything, and everything but random, balanced_target, toolkit_target

In [None]:
import glob
# for model in models_dir:
import time

# Only the first model in models_dir is processed by this cell.
model = models_dir[0]
print(model)
for tuning_dir_ in tuning_dir:
    print(tuning_dir_)
    # Load one embedding matrix per dataset, in data_dir order.
    dats = []
    for dat_name in data_dir:
        d_path = hn_dir+model+dat_name+'_'+tuning_dir_ + '.npy'
        print(d_path)
        dats.append(np.load(d_path))
    for loaded in dats:
        print(loaded.shape)

    # first, embed each of the datasets on its own:
    # PCA straight to 2-D, and PCA to 5-D followed by t-SNE to 2-D
    pca_res = []
    tsne_res = []
    for idx, single in enumerate(dats):
        print(data_dir[idx])
        start_time = time.time()
        pca_curr = PCA(n_components=2).fit_transform(single)
        pca_res.append(pca_curr)
        tsne_curr = PCA(n_components=5).fit_transform(single)
        tsne_curr = TSNE(n_components=2).fit_transform(tsne_curr)
        tsne_res.append(tsne_curr)
        print("--- %s seconds ---" % (time.time() - start_time))

    # All results of this (model, tuning) combination share these folders.
    pca_prefix = pca_out_dir+model+tuning_dir_+"/"
    tsne_prefix = tsne_out_dir+model+tuning_dir_+"/"

    for idx, dat_name in enumerate(data_dir):
        pca_out_path = pca_prefix+dat_name+".npy"
        tsne_out_path = tsne_prefix+dat_name+".npy"
        print(pca_out_path)
        print(tsne_out_path)
        np.save(pca_out_path, pca_res[idx])
        np.save(tsne_out_path, tsne_res[idx])

    # then, perform the joint embeddings; data_dir[i] is passed as the
    # "target" set and data_dir[j] as the "background" set:
    #   (0, 2) motor_toolkit          vs pfamA_random
    #   (1, 2) pfamA_motors_balanced  vs pfamA_random
    # NOTE(review): the earlier comment here also listed target/random and
    # toolkit/target pairings, which are not in this list - confirm whether
    # they should be added.
    for i,j in [(0,2),(1,2)]:
        start_time = time.time()
        # Index the list of datasets (dats). The previous code indexed the
        # leftover loop variable `dat`, which passed two single rows of the
        # last dataset instead of two whole datasets.
        target_tsne,background_tsne = joint_embed_tsne(dats[i],dats[j])
        target_pca,background_pca = joint_embed_pca(dats[i],dats[j])
        print("--- %s seconds ---" % (time.time() - start_time))
        pair_name = data_dir[i]+"_"+data_dir[j]
        pca_out_path_t = pca_prefix+pair_name+"_target.npy"
        tsne_out_path_t = tsne_prefix+pair_name+"_target.npy"
        pca_out_path_b = pca_prefix+pair_name+"_background.npy"
        tsne_out_path_b = tsne_prefix+pair_name+"_background.npy"
        print(pca_out_path_t)
        print(pca_out_path_b)
        print(tsne_out_path_t)
        print(tsne_out_path_b)
        np.save(pca_out_path_t, target_pca)
        np.save(pca_out_path_b, background_pca)
        np.save(tsne_out_path_t, target_tsne)
        np.save(tsne_out_path_b, background_tsne)

lstm5/
evotune_balanced
../../out/201027/embedding/lstm5/motor_toolkit_evotune_balanced.npy
../../out/201027/embedding/lstm5/pfamA_motors_balanced_evotune_balanced.npy
../../out/201027/embedding/lstm5/pfamA_random_evotune_balanced.npy
../../out/201027/embedding/lstm5/pfamA_target_evotune_balanced.npy
(3255, 256)
(18000, 256)
(1600, 256)
(59149, 256)
motor_toolkit
--- 66.47476363182068 seconds ---
pfamA_motors_balanced
--- 433.23306941986084 seconds ---
pfamA_random
--- 29.495662450790405 seconds ---
pfamA_target
