In [1]:
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
import numpy as np
import pandas as pd

In [16]:
# Read the pfamA target subset table and keep its first column (shown as
# "Unnamed: 0" in the output), which is later used to select rows of the
# pfamA_target embedding matrix.
pfamA_target_sub = pd.read_csv("../../data/pfamA_target_sub.csv")
pfamA_target_sub_ind = pfamA_target_sub.iloc[:, 0]
# Preview the first few indices as the cell output.
pfamA_target_sub_ind.head()

0    31815
1    57446
2    52009
3    56752
4    27603
Name: Unnamed: 0, dtype: int64

In [17]:
# Read the shortened motor toolkit table and keep its first column (shown as
# "Unnamed: 0" in the output), which is later used to select rows of the
# motor_toolkit embedding matrix.
motor_toolkit_short = pd.read_csv("../../data/motor_toolkit_short.csv")
motor_toolkit_short_ind = motor_toolkit_short.iloc[:, 0]
# Preview the first few indices as the cell output.
motor_toolkit_short_ind.head()

0    0
1    1
2    2
3    3
4    4
Name: Unnamed: 0, dtype: int64

In [18]:
# Directory containing the saved embedding matrices (.npy), one sub-folder per model.
hn_dir = "../../out/201027/embedding/"

# Model sub-folders (each entry keeps its trailing slash so it can be
# concatenated directly into a path).
models_dir = [
    "lstm5/",
    "seq2seq/",
    "transformer_encoder/",
]

# Dataset name prefixes of the embedding files.
data_dir = [
    "motor_toolkit",
    "pfamA_motors_balanced",
    "pfamA_random",
    "pfamA_target",
]

# Tuning variants; used as the file-name suffix of the inputs and as the
# sub-folder name of the outputs.
tuning_dir = [
    "evotune_balanced",
    "evotune_balanced_target",
    "mini_balanced",
    "mini_balanced_target",
    "raw",
]

# Root folders that receive the 2-D t-SNE and PCA projections.
tsne_out_dir = "../../out/201027/tsne/"
pca_out_dir = "../../out/201027/pca/"

def joint_embed_tsne(target,background,random_state=None):
    """Embed two sample sets into one shared 2-D t-SNE space.

    The rows of ``target`` and ``background`` are stacked, reduced to 5
    principal components with PCA, and then mapped to 2 dimensions with
    t-SNE. The joint result is split back into the two original groups, so
    both groups live in the same coordinate system.

    Parameters
    ----------
    target, background : numpy.ndarray
        Sample matrices with samples along axis 0; they are concatenated
        along axis 0, so the remaining dimensions must match.
    random_state : int, RandomState instance or None, default=None
        Forwarded to both ``PCA`` and ``TSNE``. Pass an integer to make the
        embedding reproducible across runs; ``None`` keeps the previous
        unseeded behaviour.

    Returns
    -------
    tuple
        ``(target_em, background_em)``: the first ``target.shape[0]`` rows of
        the joint 2-D embedding and the remaining rows, respectively.
    """
    n_target = target.shape[0]
    jointed = np.concatenate((target, background), axis=0)
    # PCA to 5 components first, then t-SNE on the reduced data.
    pca_jointed = PCA(n_components=5, random_state=random_state).fit_transform(jointed)
    jointed_em_2 = TSNE(n_components=2, random_state=random_state).fit_transform(pca_jointed)
    # Rows keep the concatenation order: target first, background after.
    target_em = jointed_em_2[:n_target,:]
    background_em = jointed_em_2[n_target:,:]
    return target_em,background_em

def joint_embed_pca(target,background):
    """Project two sample sets onto the same 2 principal components.

    ``target`` and ``background`` are stacked along axis 0, a single
    2-component PCA is fitted on the stacked data, and the projected rows
    are handed back as two arrays in the original grouping.

    Returns
    -------
    tuple
        ``(target_em, background_em)``: the first ``target.shape[0]`` projected
        rows and the remaining projected rows, respectively.
    """
    split_at = target.shape[0]
    stacked = np.concatenate((target, background), axis=0)
    reducer = PCA(n_components=2)
    projected = reducer.fit_transform(stacked)
    # Rows keep the concatenation order: target first, background after.
    return projected[:split_at,:], projected[split_at:,:]

In [19]:
# make one directory for each tuning method
# import os, sys
# for model_dir in models_dir:
#     os.mkdir(pca_out_dir+model_dir)
#     os.mkdir(tsne_out_dir+model_dir)
#     for tuning_dir_ in tuning_dir:
#         pca_path = pca_out_dir+model_dir+tuning_dir_
#         print(pca_path)
#         os.mkdir(pca_path)
#         tsne_path = tsne_out_dir+model_dir+tuning_dir_
#         os.mkdir(tsne_path);

## TSNE/PCA 
- Dimensionality reduction: either PCA directly to 2 dimensions, or PCA to 5 dimensions followed by t-SNE
- Individual embedding: each of the sets ["motor_toolkit","pfamA_motors","pfamA_random","pfamA_target"] is embedded on its own
- jointly embed everything, and everything but random, balanced_target, toolkit_target

In [21]:
import glob
import os
import time

# A single model is processed per execution of this cell, selected by index
# (looping over all of models_dir is currently disabled).
model = models_dir[1]
print(model)
for tuning_dir_ in tuning_dir:
    print(tuning_dir_)

    # Output folders for this model/tuning combination. They are created on
    # demand so that np.save below does not raise FileNotFoundError when the
    # folders have not been prepared beforehand.
    pca_dir = pca_out_dir+model+tuning_dir_+"/"
    tsne_dir = tsne_out_dir+model+tuning_dir_+"/"
    os.makedirs(pca_dir, exist_ok=True)
    os.makedirs(tsne_dir, exist_ok=True)

    # Load the embedding matrix of every dataset for this model/tuning pair.
    dats = []
    for dat in data_dir:
        d_path = hn_dir+model+dat+'_'+tuning_dir_ + '.npy'
        print(d_path)
        d = np.load(d_path)
        # Two of the datasets are restricted to the row indices read from
        # the CSV files in the earlier cells.
        if dat == 'motor_toolkit':
            d = d[motor_toolkit_short_ind,:]
        elif dat == 'pfamA_target':
            d = d[pfamA_target_sub_ind,:]
        dats.append(d)
    for d in dats:
        print(d.shape)

    # first, embed each of the datasets on its own
    pca_res = []
    tsne_res = []
    for dat_name, dat in zip(data_dir, dats):
        print(dat_name)
        start_time = time.time()
        # Direct PCA to 2 dimensions.
        pca_curr = PCA(n_components=2).fit_transform(dat)
        pca_res.append(pca_curr)
        # PCA to 5 dimensions followed by t-SNE to 2 dimensions.
        tsne_curr = PCA(n_components=5).fit_transform(dat)
        tsne_curr = TSNE(n_components=2).fit_transform(tsne_curr)
        tsne_res.append(tsne_curr)
        print("--- %s seconds ---" % (time.time() - start_time))

    for i,dat_name in enumerate(data_dir):
        pca_out_path = pca_dir+dat_name+".npy"
        tsne_out_path = tsne_dir+dat_name+".npy"
        print(pca_out_path)
        print(tsne_out_path)
        np.save(pca_out_path, pca_res[i])
        np.save(tsne_out_path, tsne_res[i])

    # then, embed pairs of datasets jointly (indices refer to data_dir):
    #   (0,2) motor_toolkit against pfamA_random
    #   (1,2) pfamA_motors_balanced against pfamA_random
    # No pairing involving pfamA_target is computed by this loop.
    for i,j in [(0,2),(1,2)]:
        start_time = time.time()
        target_tsne,background_tsne = joint_embed_tsne(dats[i],dats[j])
        target_pca,background_pca = joint_embed_pca(dats[i],dats[j])
        print("--- %s seconds ---" % (time.time() - start_time))
        # File stem shared by the four outputs of this pairing.
        pair_name = data_dir[i]+"_"+data_dir[j]
        pca_out_path_t = pca_dir+pair_name+"_target.npy"
        tsne_out_path_t = tsne_dir+pair_name+"_target.npy"
        pca_out_path_b = pca_dir+pair_name+"_background.npy"
        tsne_out_path_b = tsne_dir+pair_name+"_background.npy"
        print(pca_out_path_t)
        print(pca_out_path_b)
        print(tsne_out_path_t)
        print(tsne_out_path_b)
        np.save(pca_out_path_t, target_pca)
        np.save(pca_out_path_b, background_pca)
        np.save(tsne_out_path_t, target_tsne)
        np.save(tsne_out_path_b, background_tsne)

seq2seq/
evotune_balanced
../../out/201027/embedding/seq2seq/motor_toolkit_evotune_balanced.npy
../../out/201027/embedding/seq2seq/pfamA_motors_balanced_evotune_balanced.npy
../../out/201027/embedding/seq2seq/pfamA_random_evotune_balanced.npy
../../out/201027/embedding/seq2seq/pfamA_target_evotune_balanced.npy
(3235, 128)
(18000, 128)
(1600, 128)
(5544, 128)
motor_toolkit
--- 66.95456576347351 seconds ---
pfamA_motors_balanced
--- 461.94230914115906 seconds ---
pfamA_random
--- 33.632333755493164 seconds ---
pfamA_target
--- 128.13106966018677 seconds ---
../../out/201027/pca/seq2seq/evotune_balanced/motor_toolkit.npy
../../out/201027/tsne/seq2seq/evotune_balanced/motor_toolkit.npy
../../out/201027/pca/seq2seq/evotune_balanced/pfamA_motors_balanced.npy
../../out/201027/tsne/seq2seq/evotune_balanced/pfamA_motors_balanced.npy
../../out/201027/pca/seq2seq/evotune_balanced/pfamA_random.npy
../../out/201027/tsne/seq2seq/evotune_balanced/pfamA_random.npy
../../out/201027/pca/seq2seq/evotune_

--- 105.43912553787231 seconds ---
../../out/201027/pca/seq2seq/raw/motor_toolkit_pfamA_random_target.npy
../../out/201027/pca/seq2seq/raw/motor_toolkit_pfamA_random_background.npy
../../out/201027/tsne/seq2seq/raw/motor_toolkit_pfamA_random_target.npy
../../out/201027/tsne/seq2seq/raw/motor_toolkit_pfamA_random_background.npy
--- 454.3562297821045 seconds ---
../../out/201027/pca/seq2seq/raw/pfamA_motors_balanced_pfamA_random_target.npy
../../out/201027/pca/seq2seq/raw/pfamA_motors_balanced_pfamA_random_background.npy
../../out/201027/tsne/seq2seq/raw/pfamA_motors_balanced_pfamA_random_target.npy
../../out/201027/tsne/seq2seq/raw/pfamA_motors_balanced_pfamA_random_background.npy
