In [1]:
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
import numpy as np
import pandas as pd

In [5]:
# Root directory of the input embeddings. Later cells read files from
# <hn_dir>/<tuning variant>/<dataset name>.npy.
hn_dir = "../../out/201102/embedding/esm_models_normalized"

# Sub-directory names, one per tuning variant; each is processed in turn.
tuning_dir = [
    "t12_balanced",
    "t12_kinesin",
    "t12_motor_toolkit",
    "t12",
    "t34",
]

# Output roots for the 2-D embeddings. Both end with "/" because later cells
# build paths by plain string concatenation.
tsne_out_dir = "../../out/201102/normalized/tsne/"
pca_out_dir = "../../out/201102/normalized/pca/"

# Dataset file stems (without the ".npy" suffix). The order matters: the
# joint-embedding step further down indexes this list by position.
data_names = [
    "pfamA_random",
    "motor_toolkit",
    "pfamA_balanced",
    "pfamA_target",
    "kinesin_labelled",
]

In [6]:
def joint_embed_tsne(target, background, n_pca_components=5, random_state=None):
    """Embed two datasets into one shared 2-D t-SNE space.

    The rows of ``target`` and ``background`` are stacked, reduced with PCA,
    and the reduced matrix is embedded with t-SNE. Because both datasets go
    through the same fit, their output coordinates are directly comparable.

    Parameters
    ----------
    target, background : numpy.ndarray
        2-D arrays with the same number of columns (required by the
        row-wise ``np.concatenate``).
    n_pca_components : int, default 5
        Number of principal components kept before t-SNE.
    random_state : int, numpy.random.RandomState or None, default None
        Forwarded to both ``PCA`` and ``TSNE``. Pass an int to make the
        embedding repeatable; ``None`` keeps the original unseeded behaviour.

    Returns
    -------
    (target_em, background_em) : tuple of numpy.ndarray
        The 2-D coordinates of the ``target`` rows and of the ``background``
        rows, in their original row order.
    """
    n_target = target.shape[0]
    stacked = np.concatenate((target, background), axis=0)
    reduced = PCA(
        n_components=n_pca_components, random_state=random_state
    ).fit_transform(stacked)
    embedded = TSNE(n_components=2, random_state=random_state).fit_transform(reduced)
    # The first n_target rows came from `target`; the remainder from `background`.
    return embedded[:n_target, :], embedded[n_target:, :]

def joint_embed_pca(target, background, n_components=2, random_state=None):
    """Project two datasets onto the principal components of their union.

    The rows of ``target`` and ``background`` are stacked and a single PCA is
    fitted on the stacked matrix, so both outputs share the same axes.

    Parameters
    ----------
    target, background : numpy.ndarray
        2-D arrays with the same number of columns (required by the
        row-wise ``np.concatenate``).
    n_components : int, default 2
        Number of principal components to keep.
    random_state : int, numpy.random.RandomState or None, default None
        Forwarded to ``PCA``; it only has an effect when scikit-learn selects
        a randomized SVD solver. ``None`` keeps the original behaviour.

    Returns
    -------
    (target_em, background_em) : tuple of numpy.ndarray
        The projected ``target`` rows and the projected ``background`` rows,
        in their original row order.
    """
    n_target = target.shape[0]
    stacked = np.concatenate((target, background), axis=0)
    projected = PCA(
        n_components=n_components, random_state=random_state
    ).fit_transform(stacked)
    # The first n_target rows came from `target`; the remainder from `background`.
    return projected[:n_target, :], projected[n_target:, :]

In [7]:
# Make one output directory per tuning method under both the PCA and the
# t-SNE output roots.
import os, sys

for tuning_dir_ in tuning_dir:
    pca_path = pca_out_dir + tuning_dir_
    print(pca_path)
    # makedirs(..., exist_ok=True) creates missing parent directories and does
    # not raise when the directory already exists, so this cell can be re-run.
    os.makedirs(pca_path, exist_ok=True)
    tsne_path = tsne_out_dir + tuning_dir_
    os.makedirs(tsne_path, exist_ok=True)

../../out/201102/normalized/pca/t12_balanced
../../out/201102/normalized/pca/t12_kinesin
../../out/201102/normalized/pca/t12_motor_toolkit
../../out/201102/normalized/pca/t12
../../out/201102/normalized/pca/t34


In [8]:
import glob
import time

# Seed for NumPy's global random state. The PCA/TSNE estimators used here are
# created without a random_state, in which case scikit-learn draws from
# NumPy's global RandomState, so seeding it makes the saved embeddings
# repeatable across runs on the same setup.
EMBED_SEED = 0

for tuning_dir_ in tuning_dir:
    print(tuning_dir_)

    # Re-seed per tuning variant so each variant's result does not depend on
    # which variants were processed before it. NOTE: this resets the global
    # NumPy random state for anything that runs afterwards.
    np.random.seed(EMBED_SEED)

    # Load every dataset of this tuning variant, in data_names order.
    dats = []
    for dat_name in data_names:
        d_path = hn_dir + '/' + tuning_dir_ + '/' + dat_name + '.npy'
        dats.append(np.load(d_path))
    # Report the shape of every loaded dataset (not just the first four).
    for dat in dats:
        print(dat.shape)

    # First, embed each dataset on its own.
    pca_res = []
    tsne_res = []
    for dat_name, dat in zip(data_names, dats):
        print(dat_name)
        start_time = time.time()
        # Plain 2-component PCA projection.
        pca_res.append(PCA(n_components=2).fit_transform(dat))
        # t-SNE is run on a 10-component PCA reduction of the data.
        reduced = PCA(n_components=10).fit_transform(dat)
        tsne_res.append(TSNE(n_components=2).fit_transform(reduced))
        print("--- %s seconds ---" % (time.time() - start_time))

    # Save the per-dataset embeddings as <out root><tuning variant>/<dataset>.npy.
    for dat_name, pca_em, tsne_em in zip(data_names, pca_res, tsne_res):
        pca_out_path = pca_out_dir + tuning_dir_ + "/" + dat_name + ".npy"
        tsne_out_path = tsne_out_dir + tuning_dir_ + "/" + dat_name + ".npy"
        print(pca_out_path)
        print(tsne_out_path)
        np.save(pca_out_path, pca_em)
        np.save(tsne_out_path, tsne_em)

    # Then, perform the joint embeddings. Each (i, j) pair indexes data_names:
    # dataset i is embedded together with background dataset j.
    #   1. pfamA_balanced (2) with pfamA_random (0)
    #   2. pfamA_target   (3) with pfamA_random (0)
    for i, j in [(2, 0), (3, 0)]:
        start_time = time.time()
        target_tsne, background_tsne = joint_embed_tsne(dats[i], dats[j])
        target_pca, background_pca = joint_embed_pca(dats[i], dats[j])
        print("--- %s seconds ---" % (time.time() - start_time))

        # File stem shared by the four outputs of this pair.
        pair_name = data_names[i] + "_" + data_names[j]
        pca_out_path_t = pca_out_dir + tuning_dir_ + "/" + pair_name + "_target.npy"
        tsne_out_path_t = tsne_out_dir + tuning_dir_ + "/" + pair_name + "_target.npy"
        pca_out_path_b = pca_out_dir + tuning_dir_ + "/" + pair_name + "_background.npy"
        tsne_out_path_b = tsne_out_dir + tuning_dir_ + "/" + pair_name + "_background.npy"
        print(pca_out_path_t)
        print(pca_out_path_b)
        print(tsne_out_path_t)
        print(tsne_out_path_b)
        np.save(pca_out_path_t, target_pca)
        np.save(pca_out_path_b, background_pca)
        np.save(tsne_out_path_t, target_tsne)
        np.save(tsne_out_path_b, background_tsne)


t12_balanced
(1600, 768)
(3255, 768)
(18000, 768)
(5544, 768)
pfamA_random
--- 28.558871746063232 seconds ---
motor_toolkit
--- 66.02222776412964 seconds ---
pfamA_balanced
--- 395.006564617157 seconds ---
pfamA_target
--- 110.27404570579529 seconds ---
kinesin_labelled
--- 20.00857901573181 seconds ---
../../out/201102/normalized/pca/t12_balanced/pfamA_random.npy
../../out/201102/normalized/tsne/t12_balanced/pfamA_random.npy
../../out/201102/normalized/pca/t12_balanced/motor_toolkit.npy
../../out/201102/normalized/tsne/t12_balanced/motor_toolkit.npy
../../out/201102/normalized/pca/t12_balanced/pfamA_balanced.npy
../../out/201102/normalized/tsne/t12_balanced/pfamA_balanced.npy
../../out/201102/normalized/pca/t12_balanced/pfamA_target.npy
../../out/201102/normalized/tsne/t12_balanced/pfamA_target.npy
../../out/201102/normalized/pca/t12_balanced/kinesin_labelled.npy
../../out/201102/normalized/tsne/t12_balanced/kinesin_labelled.npy
--- 442.8171474933624 seconds ---
../../out/201102/norma