In [19]:
import numpy as np
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from sklearn.decomposition import PCA
import umap

%matplotlib inline
# Notebook-wide seaborn theme: white style, "notebook" context, and a
# 14x10 default figure size passed through matplotlib rc parameters.
sns.set(style='white', context='notebook', rc={'figure.figsize':(14,10)})

In [20]:
# Root directory of the input embeddings; the loading cell reads files laid
# out as <root>/<tuning>/<data_name>.npy under this location.
hn_dir = "../../out/201102/embedding/esm_models"
# Sub-directory names, one per tuning variant (this is a list despite the
# singular name).
tuning_dir = ["t12","t34"]
# Root directory for the 2-D UMAP results; note the trailing slash, which
# other cells rely on when concatenating paths.
umap_out_dir = "../../out/201102/umap/"
# Dataset base names; each one is expected to exist as "<name>.npy" in every
# tuning sub-directory.
data_names = ["pfamA_random","motor_toolkit","pfamA_balanced","pfamA_target","kinesin_labelled"]

In [21]:
def joint_embed_tsne(target,background):
    """Embed two datasets together in a single 2-D t-SNE space.

    The rows of ``target`` and ``background`` are stacked, reduced to five
    principal components, and the reduced matrix is embedded in 2-D with
    t-SNE. The joint embedding is then split back into the two inputs.

    Parameters
    ----------
    target, background : 2-D arrays
        They are concatenated along axis 0, so they must have the same
        number of columns.

    Returns
    -------
    (target_em, background_em)
        The 2-D coordinates of the ``target`` rows and of the
        ``background`` rows, in their original row order.
    """
    # TSNE is not imported at the top of the notebook; import it here so the
    # function does not fail with a NameError when it is called.
    from sklearn.manifold import TSNE

    n_target = target.shape[0]
    jointed = np.concatenate((target, background), axis=0)
    pca_jointed = PCA(n_components=5).fit_transform(jointed)
    jointed_em_2 = TSNE(n_components=2).fit_transform(pca_jointed)
    # The first n_target rows belong to `target`, the rest to `background`.
    target_em = jointed_em_2[:n_target, :]
    background_em = jointed_em_2[n_target:, :]
    return target_em,background_em

def joint_embed_pca(target,background):
    """Project two datasets onto the same two principal components.

    ``target`` and ``background`` are stacked row-wise, a 2-component PCA is
    fitted on the stacked matrix, and the projected rows are handed back
    separately for each input.

    Parameters
    ----------
    target, background : 2-D arrays
        Concatenated along axis 0, so their column counts must match.

    Returns
    -------
    (target_em, background_em)
        2-D projections of the ``target`` rows and of the ``background``
        rows, in their original row order.
    """
    split_at = target.shape[0]
    stacked = np.concatenate((target, background), axis=0)
    projected = PCA(n_components=2).fit_transform(stacked)
    # Rows before `split_at` came from `target`; the remainder from `background`.
    return projected[:split_at, :], projected[split_at:, :]

In [23]:
# Make one output directory for each tuning method.
import os, sys
for tuning_dir_ in tuning_dir:
    umap_path = os.path.join(umap_out_dir, tuning_dir_)
    print(umap_path)
    # makedirs(..., exist_ok=True) keeps this cell re-runnable: it neither
    # fails when the directory already exists nor when the parent output
    # directory has not been created yet (os.mkdir fails in both cases).
    os.makedirs(umap_path, exist_ok=True)


../../out/201102/umap/t12
../../out/201102/umap/t34


In [25]:
import glob
import os
import time

# Number of principal components kept before handing the data to UMAP.
N_PCA_COMPONENTS = 40

# For every tuning variant: load all datasets, embed each one on its own with
# StandardScaler -> PCA -> UMAP, and save the 2-D result as
# <umap_out_dir>/<tuning>/<data_name>.npy.
for tuning_dir_ in tuning_dir:
    print(tuning_dir_)

    # Load the embedding matrix of every dataset for this tuning variant.
    # The path is built from the configured `hn_dir` so that changing the
    # configuration cell actually changes where data is read from.
    dats = []
    for dat in data_names:
        d_path = os.path.join(hn_dir, tuning_dir_, dat + '.npy')
        d = np.load(d_path)
        dats.append(d)

    # Echo the shapes of the first four datasets as a quick sanity check.
    # Slicing (rather than indexing 0..3) avoids an IndexError when fewer
    # than four datasets are configured.
    for loaded in dats[:4]:
        print(loaded.shape)

    # First, embed each of the datasets on its own.
    umap_res = []
    for i, dat in enumerate(dats):
        print(data_names[i])
        start_time = time.time()
        scaled_data = StandardScaler().fit_transform(dat)
        pca_curr = PCA(n_components=N_PCA_COMPONENTS).fit_transform(scaled_data)
        # NOTE(review): UMAP is constructed with its defaults and no explicit
        # random_state, so coordinates are presumably not reproducible from
        # run to run -- confirm whether a fixed seed is wanted.
        reducer = umap.UMAP()
        umap_curr = reducer.fit_transform(pca_curr)
        umap_res.append(umap_curr)
        print("--- %s seconds ---" % (time.time() - start_time))

    # Save after all embeddings are computed (keeps timing and path output
    # grouped the same way as before).
    for i,dat_name in enumerate(data_names):
        umap_out_path = os.path.join(umap_out_dir, tuning_dir_, dat_name + ".npy")
        print(umap_out_path)
        np.save(umap_out_path, umap_res[i])

    # Joint embeddings (pfamA_balanced vs. pfamA_random and pfamA_target vs.
    # pfamA_random) are not computed in this cell; the helpers
    # joint_embed_tsne / joint_embed_pca defined in this notebook exist for
    # that, but no PCA / t-SNE output directories are configured here.


t12
(1600, 768)
(3255, 768)
(18000, 768)
(5544, 768)
pfamA_random
--- 4.850423097610474 seconds ---
motor_toolkit




--- 10.232107400894165 seconds ---
pfamA_balanced
--- 18.93717336654663 seconds ---
pfamA_target
--- 8.913695335388184 seconds ---
kinesin_labelled
--- 3.6831319332122803 seconds ---
../../out/201102/umap/t12/pfamA_random.npy
../../out/201102/umap/t12/motor_toolkit.npy
../../out/201102/umap/t12/pfamA_balanced.npy
../../out/201102/umap/t12/pfamA_target.npy
../../out/201102/umap/t12/kinesin_labelled.npy
t34
(1600, 1280)
(3255, 1280)
(18000, 1280)
(5544, 1280)
pfamA_random




--- 5.27403998374939 seconds ---
motor_toolkit




--- 11.427436351776123 seconds ---
pfamA_balanced
--- 13.91210150718689 seconds ---
pfamA_target
--- 9.452322959899902 seconds ---
kinesin_labelled
--- 4.002966642379761 seconds ---
../../out/201102/umap/t34/pfamA_random.npy
../../out/201102/umap/t34/motor_toolkit.npy
../../out/201102/umap/t34/pfamA_balanced.npy
../../out/201102/umap/t34/pfamA_target.npy
../../out/201102/umap/t34/kinesin_labelled.npy


In [26]:
# Reload one of the arrays written by the UMAP cell (t34 / pfamA_random) to
# check that the saved file can be read back.
a = np.load("../../out/201102/umap/t34/pfamA_random.npy")

In [29]:
# Show the first five rows of the reloaded array.
a[0:5,:]

array([[ -8.854622 ,   5.582622 ],
       [ -8.075272 ,   3.857239 ],
       [-11.218452 ,   4.8931537],
       [ -7.5493946,   3.8694296],
       [-11.237478 ,   4.8996315]], dtype=float32)